summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig80
-rw-r--r--mm/Kconfig.debug11
-rw-r--r--mm/Makefile9
-rw-r--r--mm/backing-dev.c2
-rw-r--r--mm/balloon_compaction.c2
-rw-r--r--mm/bootmem_info.c4
-rw-r--r--mm/cma.c746
-rw-r--r--mm/cma.h47
-rw-r--r--mm/cma_debug.c61
-rw-r--r--mm/cma_sysfs.c20
-rw-r--r--mm/compaction.c98
-rw-r--r--mm/damon/core.c319
-rw-r--r--mm/damon/ops-common.c25
-rw-r--r--mm/damon/paddr.c89
-rw-r--r--mm/damon/sysfs-schemes.c179
-rw-r--r--mm/damon/sysfs.c357
-rw-r--r--mm/damon/tests/core-kunit.h6
-rw-r--r--mm/damon/vaddr.c1
-rw-r--r--mm/debug.c46
-rw-r--r--mm/execmem.c39
-rw-r--r--mm/filemap.c205
-rw-r--r--mm/folio-compat.c14
-rw-r--r--mm/gup.c35
-rw-r--r--mm/hmm.c2
-rw-r--r--mm/huge_memory.c1019
-rw-r--r--mm/hugetlb.c683
-rw-r--r--mm/hugetlb_cgroup.c31
-rw-r--r--mm/hugetlb_cma.c275
-rw-r--r--mm/hugetlb_cma.h57
-rw-r--r--mm/hugetlb_vmemmap.c199
-rw-r--r--mm/hugetlb_vmemmap.h23
-rw-r--r--mm/init-mm.c1
-rw-r--r--mm/internal.h109
-rw-r--r--mm/ioremap.c4
-rw-r--r--mm/kasan/kasan_test_c.c5
-rw-r--r--mm/kasan/report.c34
-rw-r--r--mm/khugepaged.c8
-rw-r--r--mm/kmemleak.c52
-rw-r--r--mm/kmsan/hooks.c1
-rw-r--r--mm/ksm.c9
-rw-r--r--mm/list_lru.c15
-rw-r--r--mm/madvise.c240
-rw-r--r--mm/memblock.c72
-rw-r--r--mm/memcontrol-v1.c108
-rw-r--r--mm/memcontrol-v1.h52
-rw-r--r--mm/memcontrol.c359
-rw-r--r--mm/memfd.c4
-rw-r--r--mm/memory-failure.c90
-rw-r--r--mm/memory.c580
-rw-r--r--mm/memory_hotplug.c35
-rw-r--r--mm/mempolicy.c39
-rw-r--r--mm/memremap.c60
-rw-r--r--mm/migrate.c21
-rw-r--r--mm/migrate_device.c147
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mm_init.c196
-rw-r--r--mm/mmap.c64
-rw-r--r--mm/mmu_gather.c12
-rw-r--r--mm/mprotect.c16
-rw-r--r--mm/mremap.c1449
-rw-r--r--mm/nommu.c111
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page-writeback.c42
-rw-r--r--mm/page_alloc.c974
-rw-r--r--mm/page_counter.c4
-rw-r--r--mm/page_ext.c13
-rw-r--r--mm/page_idle.c9
-rw-r--r--mm/page_io.c6
-rw-r--r--mm/page_isolation.c19
-rw-r--r--mm/page_owner.c94
-rw-r--r--mm/page_table_check.c44
-rw-r--r--mm/page_vma_mapped.c16
-rw-r--r--mm/percpu.c12
-rw-r--r--mm/readahead.c14
-rw-r--r--mm/rmap.c939
-rw-r--r--mm/shmem.c214
-rw-r--r--mm/show_mem.c4
-rw-r--r--mm/shrinker_debug.c8
-rw-r--r--mm/slab.h34
-rw-r--r--mm/slab_common.c62
-rw-r--r--mm/slub.c387
-rw-r--r--mm/sparse-vmemmap.c168
-rw-r--r--mm/sparse.c92
-rw-r--r--mm/swap.c18
-rw-r--r--mm/swap.h7
-rw-r--r--mm/swap_cgroup.c10
-rw-r--r--mm/swap_slots.c295
-rw-r--r--mm/swap_state.c91
-rw-r--r--mm/swapfile.c444
-rw-r--r--mm/truncate.c55
-rw-r--r--mm/usercopy.c18
-rw-r--r--mm/userfaultfd.c145
-rw-r--r--mm/util.c232
-rw-r--r--mm/vma.c367
-rw-r--r--mm/vma.h101
-rw-r--r--mm/vmalloc.c24
-rw-r--r--mm/vmscan.c275
-rw-r--r--mm/vmstat.c51
-rw-r--r--mm/z3fold.c1447
-rw-r--r--mm/zbud.c455
-rw-r--r--mm/zpool.c97
-rw-r--r--mm/zsmalloc.c498
-rw-r--r--mm/zswap.c231
104 files changed, 9563 insertions, 7036 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 1b501db06417..e113f713b493 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -129,7 +129,6 @@ choice
prompt "Default allocator"
depends on ZSWAP
default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU
- default ZSWAP_ZPOOL_DEFAULT_ZBUD
help
Selects the default allocator for the compressed cache for
swap pages.
@@ -140,21 +139,6 @@ choice
The selection made here can be overridden by using the kernel
command line 'zswap.zpool=' option.
-config ZSWAP_ZPOOL_DEFAULT_ZBUD
- bool "zbud"
- select ZBUD
- help
- Use the zbud allocator as the default allocator.
-
-config ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED
- bool "z3foldi (DEPRECATED)"
- select Z3FOLD_DEPRECATED
- help
- Use the z3fold allocator as the default allocator.
-
- Deprecated and scheduled for removal in a few cycles,
- see CONFIG_Z3FOLD_DEPRECATED.
-
config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
bool "zsmalloc"
select ZSMALLOC
@@ -165,40 +149,9 @@ endchoice
config ZSWAP_ZPOOL_DEFAULT
string
depends on ZSWAP
- default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD
- default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED
default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
default ""
-config ZBUD
- tristate "2:1 compression allocator (zbud)"
- depends on ZSWAP
- help
- A special purpose allocator for storing compressed pages.
- It is designed to store up to two compressed pages per physical
- page. While this design limits storage density, it has simple and
- deterministic reclaim properties that make it preferable to a higher
- density approach when reclaim will be used.
-
-config Z3FOLD_DEPRECATED
- tristate "3:1 compression allocator (z3fold) (DEPRECATED)"
- depends on ZSWAP
- help
- Deprecated and scheduled for removal in a few cycles. If you have
- a good reason for using Z3FOLD over ZSMALLOC, please contact
- linux-mm@kvack.org and the zswap maintainers.
-
- A special purpose allocator for storing compressed pages.
- It is designed to store up to three compressed pages per physical
- page. It is a ZBUD derivative so the simplicity and determinism are
- still there.
-
-config Z3FOLD
- tristate
- default y if Z3FOLD_DEPRECATED=y
- default m if Z3FOLD_DEPRECATED=m
- depends on Z3FOLD_DEPRECATED
-
config ZSMALLOC
tristate
prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM)
@@ -242,9 +195,13 @@ menu "Slab allocator options"
config SLUB
def_bool y
+config KVFREE_RCU_BATCHED
+ def_bool y
+ depends on !SLUB_TINY && !TINY_RCU
+
config SLUB_TINY
bool "Configure for minimal memory footprint"
- depends on EXPERT
+ depends on EXPERT && !COMPILE_TEST
select SLAB_MERGE_DEFAULT
help
Configures the slab allocator in a way to achieve minimal memory
@@ -489,6 +446,9 @@ config SPARSEMEM_VMEMMAP
SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
pfn_to_page and page_to_pfn operations. This is the most
efficient option when sufficient kernel resources are available.
+
+config SPARSEMEM_VMEMMAP_PREINIT
+ bool
#
# Select this config option from the architecture Kconfig, if it is preferred
# to enable the feature of HugeTLB/dev_dax vmemmap optimization.
@@ -499,6 +459,9 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
bool
+config ARCH_WANT_HUGETLB_VMEMMAP_PREINIT
+ bool
+
config HAVE_MEMBLOCK_PHYS_MAP
bool
@@ -856,11 +819,15 @@ config ARCH_WANT_GENERAL_HUGETLB
config ARCH_WANTS_THP_SWAP
def_bool n
+config MM_ID
+ def_bool n
+
menuconfig TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
select COMPACTION
select XARRAY_MULTI
+ select MM_ID
help
Transparent Hugepages allows the kernel to use huge pages and
huge tlb transparently to the applications whenever possible.
@@ -924,8 +891,25 @@ config READ_ONLY_THP_FOR_FS
support of file THPs will be developed in the next few release
cycles.
+config NO_PAGE_MAPCOUNT
+ bool "No per-page mapcount (EXPERIMENTAL)"
+ help
+ Do not maintain per-page mapcounts for pages part of larger
+ allocations, such as transparent huge pages.
+
+ When this config option is enabled, some interfaces that relied on
+ this information will rely on less-precise per-allocation information
+ instead: for example, using the average per-page mapcount in such
+ a large allocation instead of the per-page mapcount.
+
+ EXPERIMENTAL because the impact of some changes is still unclear.
+
endif # TRANSPARENT_HUGEPAGE
+# simple helper to make the code a bit easier to read
+config PAGE_MAPCOUNT
+ def_bool !NO_PAGE_MAPCOUNT
+
#
# The architecture supports pgtable leaves that is larger than PAGE_SIZE
#
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 41a58536531d..32b65073d0cc 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -186,8 +186,9 @@ config ARCH_HAS_DEBUG_WX
config DEBUG_WX
bool "Warn on W+X mappings at boot"
depends on ARCH_HAS_DEBUG_WX
+ depends on ARCH_HAS_PTDUMP
depends on MMU
- select PTDUMP_CORE
+ select PTDUMP
help
Generate a warning if any W+X mappings are found at boot.
@@ -212,18 +213,18 @@ config DEBUG_WX
If in doubt, say "Y".
-config GENERIC_PTDUMP
+config ARCH_HAS_PTDUMP
bool
-config PTDUMP_CORE
+config PTDUMP
bool
config PTDUMP_DEBUGFS
bool "Export kernel pagetable layout to userspace via debugfs"
depends on DEBUG_KERNEL
depends on DEBUG_FS
- depends on GENERIC_PTDUMP
- select PTDUMP_CORE
+ depends on ARCH_HAS_PTDUMP
+ select PTDUMP
help
Say Y here if you want to show the kernel pagetable layout in a
debugfs file. This information is only useful for kernel developers
diff --git a/mm/Makefile b/mm/Makefile
index 850386a67b3e..e7f6bbf8ae5f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -75,10 +75,13 @@ ifdef CONFIG_MMU
obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
endif
-obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
+ifdef CONFIG_CMA
+obj-$(CONFIG_HUGETLBFS) += hugetlb_cma.o
+endif
obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
@@ -113,9 +116,7 @@ obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
-obj-$(CONFIG_ZBUD) += zbud.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
-obj-$(CONFIG_Z3FOLD) += z3fold.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_NUMA) += numa.o
@@ -138,7 +139,7 @@ obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
-obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
+obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e61bbb1bd622..783904d8c5ef 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1151,7 +1151,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
- del_timer_sync(&bdi->laptop_mode_wb_timer);
+ timer_delete_sync(&bdi->laptop_mode_wb_timer);
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 6597ebea8ae2..d3e00731e262 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -24,6 +24,7 @@ static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info,
balloon_page_insert(b_dev_info, page);
unlock_page(page);
__count_vm_event(BALLOON_INFLATE);
+ inc_node_page_state(page, NR_BALLOON_PAGES);
}
/**
@@ -103,6 +104,7 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
__count_vm_event(BALLOON_DEFLATE);
list_add(&page->lru, pages);
unlock_page(page);
+ dec_node_page_state(page, NR_BALLOON_PAGES);
n_pages++;
}
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
index 95f288169a38..b0e2a9fa641f 100644
--- a/mm/bootmem_info.c
+++ b/mm/bootmem_info.c
@@ -88,7 +88,9 @@ static void __init register_page_bootmem_info_section(unsigned long start_pfn)
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
- register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+ if (!preinited_vmemmap_section(ms))
+ register_page_bootmem_memmap(section_nr, memmap,
+ PAGES_PER_SECTION);
usage = ms->usage;
page = virt_to_page(usage);
diff --git a/mm/cma.c b/mm/cma.c
index de5bc0c81fc2..b06d5fe73399 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -18,6 +18,7 @@
#include <linux/memblock.h>
#include <linux/err.h>
+#include <linux/list.h>
#include <linux/mm.h>
#include <linux/sizes.h>
#include <linux/slab.h>
@@ -33,11 +34,17 @@
struct cma cma_areas[MAX_CMA_AREAS];
unsigned int cma_area_count;
-static DEFINE_MUTEX(cma_mutex);
+
+static int __init __cma_declare_contiguous_nid(phys_addr_t base,
+ phys_addr_t size, phys_addr_t limit,
+ phys_addr_t alignment, unsigned int order_per_bit,
+ bool fixed, const char *name, struct cma **res_cma,
+ int nid);
phys_addr_t cma_get_base(const struct cma *cma)
{
- return PFN_PHYS(cma->base_pfn);
+ WARN_ON_ONCE(cma->nranges != 1);
+ return PFN_PHYS(cma->ranges[0].base_pfn);
}
unsigned long cma_get_size(const struct cma *cma)
@@ -63,9 +70,10 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
* The value returned is represented in order_per_bits.
*/
static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
+ const struct cma_memrange *cmr,
unsigned int align_order)
{
- return (cma->base_pfn & ((1UL << align_order) - 1))
+ return (cmr->base_pfn & ((1UL << align_order) - 1))
>> cma->order_per_bit;
}
@@ -75,65 +83,122 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
}
-static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
- unsigned long count)
+static void cma_clear_bitmap(struct cma *cma, const struct cma_memrange *cmr,
+ unsigned long pfn, unsigned long count)
{
unsigned long bitmap_no, bitmap_count;
unsigned long flags;
- bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
+ bitmap_no = (pfn - cmr->base_pfn) >> cma->order_per_bit;
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
spin_lock_irqsave(&cma->lock, flags);
- bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
+ bitmap_clear(cmr->bitmap, bitmap_no, bitmap_count);
+ cma->available_count += count;
spin_unlock_irqrestore(&cma->lock, flags);
}
-static void __init cma_activate_area(struct cma *cma)
+/*
+ * Check if a CMA area contains no ranges that intersect with
+ * multiple zones. Store the result in the flags in case
+ * this gets called more than once.
+ */
+bool cma_validate_zones(struct cma *cma)
{
- unsigned long base_pfn = cma->base_pfn, pfn;
- struct zone *zone;
-
- cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
- if (!cma->bitmap)
- goto out_error;
+ int r;
+ unsigned long base_pfn;
+ struct cma_memrange *cmr;
+ bool valid_bit_set;
/*
- * alloc_contig_range() requires the pfn range specified to be in the
- * same zone. Simplify by forcing the entire CMA resv range to be in the
- * same zone.
+ * If already validated, return result of previous check.
+ * Either the valid or invalid bit will be set if this
+ * check has already been done. If neither is set, the
+ * check has not been performed yet.
*/
- WARN_ON_ONCE(!pfn_valid(base_pfn));
- zone = page_zone(pfn_to_page(base_pfn));
- for (pfn = base_pfn + 1; pfn < base_pfn + cma->count; pfn++) {
- WARN_ON_ONCE(!pfn_valid(pfn));
- if (page_zone(pfn_to_page(pfn)) != zone)
- goto not_in_zone;
+ valid_bit_set = test_bit(CMA_ZONES_VALID, &cma->flags);
+ if (valid_bit_set || test_bit(CMA_ZONES_INVALID, &cma->flags))
+ return valid_bit_set;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ base_pfn = cmr->base_pfn;
+
+ /*
+ * alloc_contig_range() requires the pfn range specified
+ * to be in the same zone. Simplify by forcing the entire
+ * CMA resv range to be in the same zone.
+ */
+ WARN_ON_ONCE(!pfn_valid(base_pfn));
+ if (pfn_range_intersects_zones(cma->nid, base_pfn, cmr->count)) {
+ set_bit(CMA_ZONES_INVALID, &cma->flags);
+ return false;
+ }
+ }
+
+ set_bit(CMA_ZONES_VALID, &cma->flags);
+
+ return true;
+}
+
+static void __init cma_activate_area(struct cma *cma)
+{
+ unsigned long pfn, end_pfn;
+ int allocrange, r;
+ struct cma_memrange *cmr;
+ unsigned long bitmap_count, count;
+
+ for (allocrange = 0; allocrange < cma->nranges; allocrange++) {
+ cmr = &cma->ranges[allocrange];
+ cmr->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma, cmr),
+ GFP_KERNEL);
+ if (!cmr->bitmap)
+ goto cleanup;
}
- for (pfn = base_pfn; pfn < base_pfn + cma->count;
- pfn += pageblock_nr_pages)
- init_cma_reserved_pageblock(pfn_to_page(pfn));
+ if (!cma_validate_zones(cma))
+ goto cleanup;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ if (cmr->early_pfn != cmr->base_pfn) {
+ count = cmr->early_pfn - cmr->base_pfn;
+ bitmap_count = cma_bitmap_pages_to_bits(cma, count);
+ bitmap_set(cmr->bitmap, 0, bitmap_count);
+ }
+
+ for (pfn = cmr->early_pfn; pfn < cmr->base_pfn + cmr->count;
+ pfn += pageblock_nr_pages)
+ init_cma_reserved_pageblock(pfn_to_page(pfn));
+ }
spin_lock_init(&cma->lock);
+ mutex_init(&cma->alloc_mutex);
+
#ifdef CONFIG_CMA_DEBUGFS
INIT_HLIST_HEAD(&cma->mem_head);
spin_lock_init(&cma->mem_head_lock);
#endif
+ set_bit(CMA_ACTIVATED, &cma->flags);
return;
-not_in_zone:
- bitmap_free(cma->bitmap);
-out_error:
+cleanup:
+ for (r = 0; r < allocrange; r++)
+ bitmap_free(cma->ranges[r].bitmap);
+
/* Expose all pages to the buddy, they are useless for CMA. */
- if (!cma->reserve_pages_on_error) {
- for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++)
- free_reserved_page(pfn_to_page(pfn));
+ if (!test_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags)) {
+ for (r = 0; r < allocrange; r++) {
+ cmr = &cma->ranges[r];
+ end_pfn = cmr->base_pfn + cmr->count;
+ for (pfn = cmr->early_pfn; pfn < end_pfn; pfn++)
+ free_reserved_page(pfn_to_page(pfn));
+ }
}
totalcma_pages -= cma->count;
- cma->count = 0;
+ cma->available_count = cma->count = 0;
pr_err("CMA area %s could not be activated\n", cma->name);
}
@@ -150,7 +215,44 @@ core_initcall(cma_init_reserved_areas);
void __init cma_reserve_pages_on_error(struct cma *cma)
{
- cma->reserve_pages_on_error = true;
+ set_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags);
+}
+
+static int __init cma_new_area(const char *name, phys_addr_t size,
+ unsigned int order_per_bit,
+ struct cma **res_cma)
+{
+ struct cma *cma;
+
+ if (cma_area_count == ARRAY_SIZE(cma_areas)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ /*
+ * Each reserved area must be initialised later, when more kernel
+ * subsystems (like slab allocator) are available.
+ */
+ cma = &cma_areas[cma_area_count];
+ cma_area_count++;
+
+ if (name)
+ snprintf(cma->name, CMA_MAX_NAME, "%s", name);
+ else
+ snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count);
+
+ cma->available_count = cma->count = size >> PAGE_SHIFT;
+ cma->order_per_bit = order_per_bit;
+ *res_cma = cma;
+ totalcma_pages += cma->count;
+
+ return 0;
+}
+
+static void __init cma_drop_area(struct cma *cma)
+{
+ totalcma_pages -= cma->count;
+ cma_area_count--;
}
/**
@@ -171,13 +273,9 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
struct cma **res_cma)
{
struct cma *cma;
+ int ret;
/* Sanity checks */
- if (cma_area_count == ARRAY_SIZE(cma_areas)) {
- pr_err("Not enough slots for CMA reserved regions!\n");
- return -ENOSPC;
- }
-
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
@@ -194,25 +292,264 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES))
return -EINVAL;
+ ret = cma_new_area(name, size, order_per_bit, &cma);
+ if (ret != 0)
+ return ret;
+
+ cma->ranges[0].base_pfn = PFN_DOWN(base);
+ cma->ranges[0].early_pfn = PFN_DOWN(base);
+ cma->ranges[0].count = cma->count;
+ cma->nranges = 1;
+ cma->nid = NUMA_NO_NODE;
+
+ *res_cma = cma;
+
+ return 0;
+}
+
+/*
+ * Structure used while walking physical memory ranges and finding out
+ * which one(s) to use for a CMA area.
+ */
+struct cma_init_memrange {
+ phys_addr_t base;
+ phys_addr_t size;
+ struct list_head list;
+};
+
+/*
+ * Work array used during CMA initialization.
+ */
+static struct cma_init_memrange memranges[CMA_MAX_RANGES] __initdata;
+
+static bool __init revsizecmp(struct cma_init_memrange *mlp,
+ struct cma_init_memrange *mrp)
+{
+ return mlp->size > mrp->size;
+}
+
+static bool __init basecmp(struct cma_init_memrange *mlp,
+ struct cma_init_memrange *mrp)
+{
+ return mlp->base < mrp->base;
+}
+
+/*
+ * Helper function to create sorted lists.
+ */
+static void __init list_insert_sorted(
+ struct list_head *ranges,
+ struct cma_init_memrange *mrp,
+ bool (*cmp)(struct cma_init_memrange *lh, struct cma_init_memrange *rh))
+{
+ struct list_head *mp;
+ struct cma_init_memrange *mlp;
+
+ if (list_empty(ranges))
+ list_add(&mrp->list, ranges);
+ else {
+ list_for_each(mp, ranges) {
+ mlp = list_entry(mp, struct cma_init_memrange, list);
+ if (cmp(mlp, mrp))
+ break;
+ }
+ __list_add(&mrp->list, mlp->list.prev, &mlp->list);
+ }
+}
+
+/*
+ * Create CMA areas with a total size of @total_size. A normal allocation
+ * for one area is tried first. If that fails, the biggest memblock
+ * ranges above 4G are selected, and allocated bottom up.
+ *
+ * The complexity here is not great, but this function will only be
+ * called during boot, and the lists operated on have fewer than
+ * CMA_MAX_RANGES elements (default value: 8).
+ */
+int __init cma_declare_contiguous_multi(phys_addr_t total_size,
+ phys_addr_t align, unsigned int order_per_bit,
+ const char *name, struct cma **res_cma, int nid)
+{
+ phys_addr_t start, end;
+ phys_addr_t size, sizesum, sizeleft;
+ struct cma_init_memrange *mrp, *mlp, *failed;
+ struct cma_memrange *cmrp;
+ LIST_HEAD(ranges);
+ LIST_HEAD(final_ranges);
+ struct list_head *mp, *next;
+ int ret, nr = 1;
+ u64 i;
+ struct cma *cma;
+
/*
- * Each reserved area must be initialised later, when more kernel
- * subsystems (like slab allocator) are available.
+ * First, try it the normal way, producing just one range.
*/
- cma = &cma_areas[cma_area_count];
+ ret = __cma_declare_contiguous_nid(0, total_size, 0, align,
+ order_per_bit, false, name, res_cma, nid);
+ if (ret != -ENOMEM)
+ goto out;
- if (name)
- snprintf(cma->name, CMA_MAX_NAME, name);
- else
- snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count);
+ /*
+ * Couldn't find one range that fits our needs, so try multiple
+ * ranges.
+ *
+ * No need to do the alignment checks here, the call to
+ * cma_declare_contiguous_nid above would have caught
+ * any issues. With the checks, we know that:
+ *
+ * - @align is a power of 2
+ * - @align is >= pageblock alignment
+ * - @size is aligned to @align and to @order_per_bit
+ *
+ * So, as long as we create ranges that have a base
+ * aligned to @align, and a size that is aligned to
+ * both @align and @order_to_bit, things will work out.
+ */
+ nr = 0;
+ sizesum = 0;
+ failed = NULL;
- cma->base_pfn = PFN_DOWN(base);
- cma->count = size >> PAGE_SHIFT;
- cma->order_per_bit = order_per_bit;
+ ret = cma_new_area(name, total_size, order_per_bit, &cma);
+ if (ret != 0)
+ goto out;
+
+ align = max_t(phys_addr_t, align, CMA_MIN_ALIGNMENT_BYTES);
+ /*
+ * Create a list of ranges above 4G, largest range first.
+ */
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
+ if (upper_32_bits(start) == 0)
+ continue;
+
+ start = ALIGN(start, align);
+ if (start >= end)
+ continue;
+
+ end = ALIGN_DOWN(end, align);
+ if (end <= start)
+ continue;
+
+ size = end - start;
+ size = ALIGN_DOWN(size, (PAGE_SIZE << order_per_bit));
+ if (!size)
+ continue;
+ sizesum += size;
+
+ pr_debug("consider %016llx - %016llx\n", (u64)start, (u64)end);
+
+ /*
+ * If we don't yet have used the maximum number of
+ * areas, grab a new one.
+ *
+ * If we can't use anymore, see if this range is not
+ * smaller than the smallest one already recorded. If
+ * not, re-use the smallest element.
+ */
+ if (nr < CMA_MAX_RANGES)
+ mrp = &memranges[nr++];
+ else {
+ mrp = list_last_entry(&ranges,
+ struct cma_init_memrange, list);
+ if (size < mrp->size)
+ continue;
+ list_del(&mrp->list);
+ sizesum -= mrp->size;
+ pr_debug("deleted %016llx - %016llx from the list\n",
+ (u64)mrp->base, (u64)mrp->base + size);
+ }
+ mrp->base = start;
+ mrp->size = size;
+
+ /*
+ * Now do a sorted insert.
+ */
+ list_insert_sorted(&ranges, mrp, revsizecmp);
+ pr_debug("added %016llx - %016llx to the list\n",
+ (u64)mrp->base, (u64)mrp->base + size);
+ pr_debug("total size now %llu\n", (u64)sizesum);
+ }
+
+ /*
+ * There is not enough room in the CMA_MAX_RANGES largest
+ * ranges, so bail out.
+ */
+ if (sizesum < total_size) {
+ cma_drop_area(cma);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Found ranges that provide enough combined space.
+ * Now, sorted them by address, smallest first, because we
+ * want to mimic a bottom-up memblock allocation.
+ */
+ sizesum = 0;
+ list_for_each_safe(mp, next, &ranges) {
+ mlp = list_entry(mp, struct cma_init_memrange, list);
+ list_del(mp);
+ list_insert_sorted(&final_ranges, mlp, basecmp);
+ sizesum += mlp->size;
+ if (sizesum >= total_size)
+ break;
+ }
+
+ /*
+ * Walk the final list, and add a CMA range for
+ * each range, possibly not using the last one fully.
+ */
+ nr = 0;
+ sizeleft = total_size;
+ list_for_each(mp, &final_ranges) {
+ mlp = list_entry(mp, struct cma_init_memrange, list);
+ size = min(sizeleft, mlp->size);
+ if (memblock_reserve(mlp->base, size)) {
+ /*
+ * Unexpected error. Could go on to
+ * the next one, but just abort to
+ * be safe.
+ */
+ failed = mlp;
+ break;
+ }
+
+ pr_debug("created region %d: %016llx - %016llx\n",
+ nr, (u64)mlp->base, (u64)mlp->base + size);
+ cmrp = &cma->ranges[nr++];
+ cmrp->base_pfn = PHYS_PFN(mlp->base);
+ cmrp->early_pfn = cmrp->base_pfn;
+ cmrp->count = size >> PAGE_SHIFT;
+
+ sizeleft -= size;
+ if (sizeleft == 0)
+ break;
+ }
+
+ if (failed) {
+ list_for_each(mp, &final_ranges) {
+ mlp = list_entry(mp, struct cma_init_memrange, list);
+ if (mlp == failed)
+ break;
+ memblock_phys_free(mlp->base, mlp->size);
+ }
+ cma_drop_area(cma);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cma->nranges = nr;
+ cma->nid = nid;
*res_cma = cma;
- cma_area_count++;
- totalcma_pages += cma->count;
- return 0;
+out:
+ if (ret != 0)
+ pr_err("Failed to reserve %lu MiB\n",
+ (unsigned long)total_size / SZ_1M);
+ else
+ pr_info("Reserved %lu MiB in %d range%s\n",
+ (unsigned long)total_size / SZ_1M, nr,
+ nr > 1 ? "s" : "");
+ return ret;
}
/**
@@ -241,6 +578,26 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
bool fixed, const char *name, struct cma **res_cma,
int nid)
{
+ int ret;
+
+ ret = __cma_declare_contiguous_nid(base, size, limit, alignment,
+ order_per_bit, fixed, name, res_cma, nid);
+ if (ret != 0)
+ pr_err("Failed to reserve %ld MiB\n",
+ (unsigned long)size / SZ_1M);
+ else
+ pr_info("Reserved %ld MiB at %pa\n",
+ (unsigned long)size / SZ_1M, &base);
+
+ return ret;
+}
+
+static int __init __cma_declare_contiguous_nid(phys_addr_t base,
+ phys_addr_t size, phys_addr_t limit,
+ phys_addr_t alignment, unsigned int order_per_bit,
+ bool fixed, const char *name, struct cma **res_cma,
+ int nid)
+{
phys_addr_t memblock_end = memblock_end_of_DRAM();
phys_addr_t highmem_start;
int ret;
@@ -272,10 +629,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
/* Sanitise input arguments. */
alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES);
if (fixed && base & (alignment - 1)) {
- ret = -EINVAL;
pr_err("Region at %pa must be aligned to %pa bytes\n",
&base, &alignment);
- goto err;
+ return -EINVAL;
}
base = ALIGN(base, alignment);
size = ALIGN(size, alignment);
@@ -293,10 +649,9 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
* low/high memory boundary.
*/
if (fixed && base < highmem_start && base + size > highmem_start) {
- ret = -EINVAL;
pr_err("Region at %pa defined on low/high memory boundary (%pa)\n",
&base, &highmem_start);
- goto err;
+ return -EINVAL;
}
/*
@@ -308,18 +663,16 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
limit = memblock_end;
if (base + size > limit) {
- ret = -EINVAL;
pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
&size, &base, &limit);
- goto err;
+ return -EINVAL;
}
/* Reserve memory */
if (fixed) {
if (memblock_is_region_reserved(base, size) ||
memblock_reserve(base, size) < 0) {
- ret = -EBUSY;
- goto err;
+ return -EBUSY;
}
} else {
phys_addr_t addr = 0;
@@ -356,10 +709,8 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
if (!addr) {
addr = memblock_alloc_range_nid(size, alignment, base,
limit, nid, true);
- if (!addr) {
- ret = -ENOMEM;
- goto err;
- }
+ if (!addr)
+ return -ENOMEM;
}
/*
@@ -372,86 +723,89 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
if (ret)
- goto free_mem;
+ memblock_phys_free(base, size);
- pr_info("Reserved %ld MiB at %pa on node %d\n", (unsigned long)size / SZ_1M,
- &base, nid);
- return 0;
+ (*res_cma)->nid = nid;
-free_mem:
- memblock_phys_free(base, size);
-err:
- pr_err("Failed to reserve %ld MiB on node %d\n", (unsigned long)size / SZ_1M,
- nid);
return ret;
}
static void cma_debug_show_areas(struct cma *cma)
{
unsigned long next_zero_bit, next_set_bit, nr_zero;
- unsigned long start = 0;
- unsigned long nr_part, nr_total = 0;
- unsigned long nbits = cma_bitmap_maxno(cma);
+ unsigned long start;
+ unsigned long nr_part;
+ unsigned long nbits;
+ int r;
+ struct cma_memrange *cmr;
spin_lock_irq(&cma->lock);
pr_info("number of available pages: ");
- for (;;) {
- next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
- if (next_zero_bit >= nbits)
- break;
- next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit);
- nr_zero = next_set_bit - next_zero_bit;
- nr_part = nr_zero << cma->order_per_bit;
- pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part,
- next_zero_bit);
- nr_total += nr_part;
- start = next_zero_bit + nr_zero;
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+
+ start = 0;
+ nbits = cma_bitmap_maxno(cma, cmr);
+
+ pr_info("range %d: ", r);
+ for (;;) {
+ next_zero_bit = find_next_zero_bit(cmr->bitmap,
+ nbits, start);
+ if (next_zero_bit >= nbits)
+ break;
+ next_set_bit = find_next_bit(cmr->bitmap, nbits,
+ next_zero_bit);
+ nr_zero = next_set_bit - next_zero_bit;
+ nr_part = nr_zero << cma->order_per_bit;
+ pr_cont("%s%lu@%lu", start ? "+" : "", nr_part,
+ next_zero_bit);
+ start = next_zero_bit + nr_zero;
+ }
+ pr_info("\n");
}
- pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
+ pr_cont("=> %lu free of %lu total pages\n", cma->available_count,
+ cma->count);
spin_unlock_irq(&cma->lock);
}
-static struct page *__cma_alloc(struct cma *cma, unsigned long count,
- unsigned int align, gfp_t gfp)
+static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
+ unsigned long count, unsigned int align,
+ struct page **pagep, gfp_t gfp)
{
unsigned long mask, offset;
unsigned long pfn = -1;
unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
- unsigned long i;
+ int ret = -EBUSY;
struct page *page = NULL;
- int ret = -ENOMEM;
- const char *name = cma ? cma->name : NULL;
-
- trace_cma_alloc_start(name, count, align);
-
- if (!cma || !cma->count || !cma->bitmap)
- return page;
-
- pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__,
- (void *)cma, cma->name, count, align);
-
- if (!count)
- return page;
mask = cma_bitmap_aligned_mask(cma, align);
- offset = cma_bitmap_aligned_offset(cma, align);
- bitmap_maxno = cma_bitmap_maxno(cma);
+ offset = cma_bitmap_aligned_offset(cma, cmr, align);
+ bitmap_maxno = cma_bitmap_maxno(cma, cmr);
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
if (bitmap_count > bitmap_maxno)
- return page;
+ goto out;
for (;;) {
spin_lock_irq(&cma->lock);
- bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
+ /*
+ * If the request is larger than the available number
+ * of pages, stop right away.
+ */
+ if (count > cma->available_count) {
+ spin_unlock_irq(&cma->lock);
+ break;
+ }
+ bitmap_no = bitmap_find_next_zero_area_off(cmr->bitmap,
bitmap_maxno, start, bitmap_count, mask,
offset);
if (bitmap_no >= bitmap_maxno) {
spin_unlock_irq(&cma->lock);
break;
}
- bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
+ bitmap_set(cmr->bitmap, bitmap_no, bitmap_count);
+ cma->available_count -= count;
/*
* It's safe to drop the lock here. We've marked this region for
* our exclusive use. If the migration fails we will take the
@@ -459,16 +813,16 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
*/
spin_unlock_irq(&cma->lock);
- pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
- mutex_lock(&cma_mutex);
+ pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit);
+ mutex_lock(&cma->alloc_mutex);
ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp);
- mutex_unlock(&cma_mutex);
+ mutex_unlock(&cma->alloc_mutex);
if (ret == 0) {
page = pfn_to_page(pfn);
break;
}
- cma_clear_bitmap(cma, pfn, count);
+ cma_clear_bitmap(cma, cmr, pfn, count);
if (ret != -EBUSY)
break;
@@ -480,6 +834,38 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
/* try again with a bit different memory target */
start = bitmap_no + mask + 1;
}
+out:
+ *pagep = page;
+ return ret;
+}
+
+static struct page *__cma_alloc(struct cma *cma, unsigned long count,
+ unsigned int align, gfp_t gfp)
+{
+ struct page *page = NULL;
+ int ret = -ENOMEM, r;
+ unsigned long i;
+ const char *name = cma ? cma->name : NULL;
+
+ trace_cma_alloc_start(name, count, align);
+
+ if (!cma || !cma->count)
+ return page;
+
+ pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__,
+ (void *)cma, cma->name, count, align);
+
+ if (!count)
+ return page;
+
+ for (r = 0; r < cma->nranges; r++) {
+ page = NULL;
+
+ ret = cma_range_alloc(cma, &cma->ranges[r], count, align,
+ &page, gfp);
+ if (ret != -EBUSY || page)
+ break;
+ }
/*
* CMA can allocate multiple page blocks, which results in different
@@ -498,7 +884,8 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
}
pr_debug("%s(): returned %p\n", __func__, page);
- trace_cma_alloc_finish(name, pfn, page, count, align, ret);
+ trace_cma_alloc_finish(name, page ? page_to_pfn(page) : 0,
+ page, count, align, ret);
if (page) {
count_vm_event(CMA_ALLOC_SUCCESS);
cma_sysfs_account_success_pages(cma, count);
@@ -541,20 +928,31 @@ struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp)
bool cma_pages_valid(struct cma *cma, const struct page *pages,
unsigned long count)
{
- unsigned long pfn;
+ unsigned long pfn, end;
+ int r;
+ struct cma_memrange *cmr;
+ bool ret;
- if (!cma || !pages)
+ if (!cma || !pages || count > cma->count)
return false;
pfn = page_to_pfn(pages);
+ ret = false;
- if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) {
- pr_debug("%s(page %p, count %lu)\n", __func__,
- (void *)pages, count);
- return false;
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ end = cmr->base_pfn + cmr->count;
+ if (pfn >= cmr->base_pfn && pfn < end) {
+ ret = pfn + count <= end;
+ break;
+ }
}
- return true;
+ if (!ret)
+ pr_debug("%s(page %p, count %lu)\n",
+ __func__, (void *)pages, count);
+
+ return ret;
}
/**
@@ -570,19 +968,32 @@ bool cma_pages_valid(struct cma *cma, const struct page *pages,
bool cma_release(struct cma *cma, const struct page *pages,
unsigned long count)
{
- unsigned long pfn;
+ struct cma_memrange *cmr;
+ unsigned long pfn, end_pfn;
+ int r;
+
+ pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
if (!cma_pages_valid(cma, pages, count))
return false;
- pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
-
pfn = page_to_pfn(pages);
+ end_pfn = pfn + count;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ if (pfn >= cmr->base_pfn &&
+ pfn < (cmr->base_pfn + cmr->count)) {
+ VM_BUG_ON(end_pfn > cmr->base_pfn + cmr->count);
+ break;
+ }
+ }
- VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+ if (r == cma->nranges)
+ return false;
free_contig_range(pfn, count);
- cma_clear_bitmap(cma, pfn, count);
+ cma_clear_bitmap(cma, cmr, pfn, count);
cma_sysfs_account_release_pages(cma, count);
trace_cma_release(cma->name, pfn, pages, count);
@@ -610,3 +1021,86 @@ int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
return 0;
}
+
+bool cma_intersects(struct cma *cma, unsigned long start, unsigned long end)
+{
+ int r;
+ struct cma_memrange *cmr;
+ unsigned long rstart, rend;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+
+ rstart = PFN_PHYS(cmr->base_pfn);
+ rend = PFN_PHYS(cmr->base_pfn + cmr->count);
+ if (end < rstart)
+ continue;
+ if (start >= rend)
+ continue;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Very basic function to reserve memory from a CMA area that has not
+ * yet been activated. This is expected to be called early, when the
+ * system is single-threaded, so there is no locking. The alignment
+ * checking is restrictive - only pageblock-aligned areas
+ * (CMA_MIN_ALIGNMENT_BYTES) may be reserved through this function.
+ * This keeps things simple, and is enough for the current use case.
+ *
+ * The CMA bitmaps have not yet been allocated, so just start
+ * reserving from the bottom up, using a PFN to keep track
+ * of what has been reserved. Unreserving is not possible.
+ *
+ * The caller is responsible for initializing the page structures
+ * in the area properly, since this just points to memblock-allocated
+ * memory. The caller should subsequently use init_cma_pageblock to
+ * set the migrate type and CMA stats the pageblocks that were reserved.
+ *
+ * If the CMA area fails to activate later, memory obtained through
+ * this interface is not handed to the page allocator, this is
+ * the responsibility of the caller (e.g. like normal memblock-allocated
+ * memory).
+ */
+void __init *cma_reserve_early(struct cma *cma, unsigned long size)
+{
+ int r;
+ struct cma_memrange *cmr;
+ unsigned long available;
+ void *ret = NULL;
+
+ if (!cma || !cma->count)
+ return NULL;
+ /*
+ * Can only be called early in init.
+ */
+ if (test_bit(CMA_ACTIVATED, &cma->flags))
+ return NULL;
+
+ if (!IS_ALIGNED(size, CMA_MIN_ALIGNMENT_BYTES))
+ return NULL;
+
+ if (!IS_ALIGNED(size, (PAGE_SIZE << cma->order_per_bit)))
+ return NULL;
+
+ size >>= PAGE_SHIFT;
+
+ if (size > cma->available_count)
+ return NULL;
+
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ available = cmr->count - (cmr->early_pfn - cmr->base_pfn);
+ if (size <= available) {
+ ret = phys_to_virt(PFN_PHYS(cmr->early_pfn));
+ cmr->early_pfn += size;
+ cma->available_count -= size;
+ return ret;
+ }
+ }
+
+ return ret;
+}
diff --git a/mm/cma.h b/mm/cma.h
index 8485ef893e99..41a3ab0ec3de 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -10,18 +10,43 @@ struct cma_kobject {
struct cma *cma;
};
+/*
+ * Multi-range support. This can be useful if the size of the allocation
+ * is not expected to be larger than the alignment (like with hugetlb_cma),
+ * and the total amount of memory requested, while smaller than the total
+ * amount of memory available, is large enough that it doesn't fit in a
+ * single physical memory range because of memory holes.
+ *
+ * Fields:
+ * @base_pfn: physical address of range
+ * @early_pfn: first PFN not reserved through cma_reserve_early
+ * @count: size of range
+ * @bitmap: bitmap of allocated (1 << order_per_bit)-sized chunks.
+ */
+struct cma_memrange {
+ unsigned long base_pfn;
+ unsigned long early_pfn;
+ unsigned long count;
+ unsigned long *bitmap;
+#ifdef CONFIG_CMA_DEBUGFS
+ struct debugfs_u32_array dfs_bitmap;
+#endif
+};
+#define CMA_MAX_RANGES 8
+
struct cma {
- unsigned long base_pfn;
unsigned long count;
- unsigned long *bitmap;
+ unsigned long available_count;
unsigned int order_per_bit; /* Order of pages represented by one bit */
spinlock_t lock;
+ struct mutex alloc_mutex;
#ifdef CONFIG_CMA_DEBUGFS
struct hlist_head mem_head;
spinlock_t mem_head_lock;
- struct debugfs_u32_array dfs_bitmap;
#endif
char name[CMA_MAX_NAME];
+ int nranges;
+ struct cma_memrange ranges[CMA_MAX_RANGES];
#ifdef CONFIG_CMA_SYSFS
/* the number of CMA page successful allocations */
atomic64_t nr_pages_succeeded;
@@ -32,15 +57,25 @@ struct cma {
/* kobject requires dynamic object */
struct cma_kobject *cma_kobj;
#endif
- bool reserve_pages_on_error;
+ unsigned long flags;
+ /* NUMA node (NUMA_NO_NODE if unspecified) */
+ int nid;
+};
+
+enum cma_flags {
+ CMA_RESERVE_PAGES_ON_ERROR,
+ CMA_ZONES_VALID,
+ CMA_ZONES_INVALID,
+ CMA_ACTIVATED,
};
extern struct cma cma_areas[MAX_CMA_AREAS];
extern unsigned int cma_area_count;
-static inline unsigned long cma_bitmap_maxno(struct cma *cma)
+static inline unsigned long cma_bitmap_maxno(struct cma *cma,
+ struct cma_memrange *cmr)
{
- return cma->count >> cma->order_per_bit;
+ return cmr->count >> cma->order_per_bit;
}
#ifdef CONFIG_CMA_SYSFS
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 602fff89b15f..fdf899532ca0 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -34,13 +34,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
static int cma_used_get(void *data, u64 *val)
{
struct cma *cma = data;
- unsigned long used;
spin_lock_irq(&cma->lock);
- /* pages counter is smaller than sizeof(int) */
- used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
+ *val = cma->count - cma->available_count;
spin_unlock_irq(&cma->lock);
- *val = (u64)used << cma->order_per_bit;
return 0;
}
@@ -49,17 +46,26 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
static int cma_maxchunk_get(void *data, u64 *val)
{
struct cma *cma = data;
+ struct cma_memrange *cmr;
unsigned long maxchunk = 0;
- unsigned long start, end = 0;
- unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
+ unsigned long start, end;
+ unsigned long bitmap_maxno;
+ int r;
spin_lock_irq(&cma->lock);
- for (;;) {
- start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
- if (start >= bitmap_maxno)
- break;
- end = find_next_bit(cma->bitmap, bitmap_maxno, start);
- maxchunk = max(end - start, maxchunk);
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ bitmap_maxno = cma_bitmap_maxno(cma, cmr);
+ end = 0;
+ for (;;) {
+ start = find_next_zero_bit(cmr->bitmap,
+ bitmap_maxno, end);
+ if (start >= bitmap_maxno)
+ break;
+ end = find_next_bit(cmr->bitmap, bitmap_maxno,
+ start);
+ maxchunk = max(end - start, maxchunk);
+ }
}
spin_unlock_irq(&cma->lock);
*val = (u64)maxchunk << cma->order_per_bit;
@@ -162,24 +168,41 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
- struct dentry *tmp;
+ struct dentry *tmp, *dir, *rangedir;
+ int r;
+ char rdirname[12];
+ struct cma_memrange *cmr;
tmp = debugfs_create_dir(cma->name, root_dentry);
debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
- debugfs_create_file("base_pfn", 0444, tmp,
- &cma->base_pfn, &cma_debugfs_fops);
debugfs_create_file("count", 0444, tmp, &cma->count, &cma_debugfs_fops);
debugfs_create_file("order_per_bit", 0444, tmp,
&cma->order_per_bit, &cma_debugfs_fops);
debugfs_create_file("used", 0444, tmp, cma, &cma_used_fops);
debugfs_create_file("maxchunk", 0444, tmp, cma, &cma_maxchunk_fops);
- cma->dfs_bitmap.array = (u32 *)cma->bitmap;
- cma->dfs_bitmap.n_elements = DIV_ROUND_UP(cma_bitmap_maxno(cma),
- BITS_PER_BYTE * sizeof(u32));
- debugfs_create_u32_array("bitmap", 0444, tmp, &cma->dfs_bitmap);
+ rangedir = debugfs_create_dir("ranges", tmp);
+ for (r = 0; r < cma->nranges; r++) {
+ cmr = &cma->ranges[r];
+ snprintf(rdirname, sizeof(rdirname), "%d", r);
+ dir = debugfs_create_dir(rdirname, rangedir);
+ debugfs_create_file("base_pfn", 0444, dir,
+ &cmr->base_pfn, &cma_debugfs_fops);
+ cmr->dfs_bitmap.array = (u32 *)cmr->bitmap;
+ cmr->dfs_bitmap.n_elements =
+ DIV_ROUND_UP(cma_bitmap_maxno(cma, cmr),
+ BITS_PER_BYTE * sizeof(u32));
+ debugfs_create_u32_array("bitmap", 0444, dir,
+ &cmr->dfs_bitmap);
+ }
+
+ /*
+ * Backward compatible symlinks to range 0 for base_pfn and bitmap.
+ */
+ debugfs_create_symlink("base_pfn", tmp, "ranges/0/base_pfn");
+ debugfs_create_symlink("bitmap", tmp, "ranges/0/bitmap");
}
static int __init cma_debugfs_init(void)
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
index f50db3973171..97acd3e5a6a5 100644
--- a/mm/cma_sysfs.c
+++ b/mm/cma_sysfs.c
@@ -62,6 +62,24 @@ static ssize_t release_pages_success_show(struct kobject *kobj,
}
CMA_ATTR_RO(release_pages_success);
+static ssize_t total_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%lu\n", cma->count);
+}
+CMA_ATTR_RO(total_pages);
+
+static ssize_t available_pages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct cma *cma = cma_from_kobj(kobj);
+
+ return sysfs_emit(buf, "%lu\n", cma->available_count);
+}
+CMA_ATTR_RO(available_pages);
+
static void cma_kobj_release(struct kobject *kobj)
{
struct cma *cma = cma_from_kobj(kobj);
@@ -75,6 +93,8 @@ static struct attribute *cma_attrs[] = {
&alloc_pages_success_attr.attr,
&alloc_pages_fail_attr.attr,
&release_pages_success_attr.attr,
+ &total_pages_attr.attr,
+ &available_pages_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(cma);
diff --git a/mm/compaction.c b/mm/compaction.c
index 12ed8425fa17..139f00c0308a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2328,11 +2328,27 @@ static enum compact_result __compact_finished(struct compact_control *cc)
if (!pageblock_aligned(cc->migrate_pfn))
return COMPACT_CONTINUE;
+ /*
+ * When defrag_mode is enabled, make kcompactd target
+ * watermarks in whole pageblocks. Because they can be stolen
+ * without polluting, no further fallback checks are needed.
+ */
+ if (defrag_mode && !cc->direct_compaction) {
+ if (__zone_watermark_ok(cc->zone, cc->order,
+ high_wmark_pages(cc->zone),
+ cc->highest_zoneidx, cc->alloc_flags,
+ zone_page_state(cc->zone,
+ NR_FREE_PAGES_BLOCKS)))
+ return COMPACT_SUCCESS;
+
+ return COMPACT_CONTINUE;
+ }
+
/* Direct compactor: Is a suitable page free? */
ret = COMPACT_NO_SUITABLE_PAGE;
for (order = cc->order; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &cc->zone->free_area[order];
- bool can_steal;
+ bool claim_block;
/* Job done if page is free of the right migratetype */
if (!free_area_empty(area, migratetype))
@@ -2349,7 +2365,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* other migratetype buddy lists.
*/
if (find_suitable_fallback(area, order, migratetype,
- true, &can_steal) != -1)
+ true, &claim_block) != -1)
/*
* Movable pages are OK in any pageblock. If we are
* stealing for a non-movable allocation, make sure
@@ -2381,40 +2397,42 @@ static enum compact_result compact_finished(struct compact_control *cc)
}
static bool __compaction_suitable(struct zone *zone, int order,
- int highest_zoneidx,
- unsigned long wmark_target)
+ unsigned long watermark, int highest_zoneidx,
+ unsigned long free_pages)
{
- unsigned long watermark;
/*
* Watermarks for order-0 must be met for compaction to be able to
* isolate free pages for migration targets. This means that the
- * watermark and alloc_flags have to match, or be more pessimistic than
- * the check in __isolate_free_page(). We don't use the direct
- * compactor's alloc_flags, as they are not relevant for freepage
- * isolation. We however do use the direct compactor's highest_zoneidx
- * to skip over zones where lowmem reserves would prevent allocation
- * even if compaction succeeds.
- * For costly orders, we require low watermark instead of min for
- * compaction to proceed to increase its chances.
+ * watermark have to match, or be more pessimistic than the check in
+ * __isolate_free_page().
+ *
+ * For costly orders, we require a higher watermark for compaction to
+ * proceed to increase its chances.
+ *
+ * We use the direct compactor's highest_zoneidx to skip over zones
+ * where lowmem reserves would prevent allocation even if compaction
+ * succeeds.
+ *
* ALLOC_CMA is used, as pages in CMA pageblocks are considered
- * suitable migration targets
+ * suitable migration targets.
*/
- watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
- low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ watermark += low_wmark_pages(zone) - min_wmark_pages(zone);
return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
- ALLOC_CMA, wmark_target);
+ ALLOC_CMA, free_pages);
}
/*
* compaction_suitable: Is this suitable to run compaction on this zone now?
*/
-bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
+bool compaction_suitable(struct zone *zone, int order, unsigned long watermark,
+ int highest_zoneidx)
{
enum compact_result compact_result;
bool suitable;
- suitable = __compaction_suitable(zone, order, highest_zoneidx,
+ suitable = __compaction_suitable(zone, order, watermark, highest_zoneidx,
zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
@@ -2452,6 +2470,7 @@ bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
return suitable;
}
+/* Used by direct reclaimers */
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
int alloc_flags)
{
@@ -2474,8 +2493,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
*/
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
- if (__compaction_suitable(zone, order, ac->highest_zoneidx,
- available))
+ if (__compaction_suitable(zone, order, min_wmark_pages(zone),
+ ac->highest_zoneidx, available))
return true;
}
@@ -2492,13 +2511,19 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
static enum compact_result
compaction_suit_allocation_order(struct zone *zone, unsigned int order,
int highest_zoneidx, unsigned int alloc_flags,
- bool async)
+ bool async, bool kcompactd)
{
+ unsigned long free_pages;
unsigned long watermark;
+ if (kcompactd && defrag_mode)
+ free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);
+ else
+ free_pages = zone_page_state(zone, NR_FREE_PAGES);
+
watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
- if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
- alloc_flags))
+ if (__zone_watermark_ok(zone, order, watermark, highest_zoneidx,
+ alloc_flags, free_pages))
return COMPACT_SUCCESS;
/*
@@ -2512,13 +2537,13 @@ compaction_suit_allocation_order(struct zone *zone, unsigned int order,
*/
if (order > PAGE_ALLOC_COSTLY_ORDER && async &&
!(alloc_flags & ALLOC_CMA)) {
- watermark = low_wmark_pages(zone) + compact_gap(order);
- if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
- 0, zone_page_state(zone, NR_FREE_PAGES)))
+ if (!__zone_watermark_ok(zone, 0, watermark + compact_gap(order),
+ highest_zoneidx, 0,
+ zone_page_state(zone, NR_FREE_PAGES)))
return COMPACT_SKIPPED;
}
- if (!compaction_suitable(zone, order, highest_zoneidx))
+ if (!compaction_suitable(zone, order, watermark, highest_zoneidx))
return COMPACT_SKIPPED;
return COMPACT_CONTINUE;
@@ -2554,7 +2579,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
ret = compaction_suit_allocation_order(cc->zone, cc->order,
cc->highest_zoneidx,
cc->alloc_flags,
- cc->mode == MIGRATE_ASYNC);
+ cc->mode == MIGRATE_ASYNC,
+ !cc->direct_compaction);
if (ret != COMPACT_CONTINUE)
return ret;
}
@@ -3048,6 +3074,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
struct zone *zone;
enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
enum compact_result ret;
+ unsigned int alloc_flags = defrag_mode ?
+ ALLOC_WMARK_HIGH : ALLOC_WMARK_MIN;
for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
zone = &pgdat->node_zones[zoneid];
@@ -3057,8 +3085,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
ret = compaction_suit_allocation_order(zone,
pgdat->kcompactd_max_order,
- highest_zoneidx, ALLOC_WMARK_MIN,
- false);
+ highest_zoneidx, alloc_flags,
+ false, true);
if (ret == COMPACT_CONTINUE)
return true;
}
@@ -3081,6 +3109,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = false,
.gfp_mask = GFP_KERNEL,
+ .alloc_flags = defrag_mode ? ALLOC_WMARK_HIGH : ALLOC_WMARK_MIN,
};
enum compact_result ret;
@@ -3099,8 +3128,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
continue;
ret = compaction_suit_allocation_order(zone,
- cc.order, zoneid, ALLOC_WMARK_MIN,
- false);
+ cc.order, zoneid, cc.alloc_flags,
+ false, true);
if (ret != COMPACT_CONTINUE)
continue;
@@ -3181,6 +3210,7 @@ static int kcompactd(void *p)
long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
long timeout = default_timeout;
+ current->flags |= PF_KCOMPACTD;
set_freezable();
pgdat->kcompactd_max_order = 0;
@@ -3237,6 +3267,8 @@ static int kcompactd(void *p)
pgdat->proactive_compact_trigger = false;
}
+ current->flags &= ~PF_KCOMPACTD;
+
return 0;
}
diff --git a/mm/damon/core.c b/mm/damon/core.c
index c7b981308862..f0c1676f0599 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -76,14 +76,13 @@ int damon_register_ops(struct damon_operations *ops)
if (ops->id >= NR_DAMON_OPS)
return -EINVAL;
+
mutex_lock(&damon_ops_lock);
/* Fail for already registered ops */
- if (__damon_is_registered_ops(ops->id)) {
+ if (__damon_is_registered_ops(ops->id))
err = -EINVAL;
- goto out;
- }
- damon_registered_ops[ops->id] = *ops;
-out:
+ else
+ damon_registered_ops[ops->id] = *ops;
mutex_unlock(&damon_ops_lock);
return err;
}
@@ -281,9 +280,31 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
return filter;
}
+/**
+ * damos_filter_for_ops() - Return if the filter is ops-hndled one.
+ * @type: type of the filter.
+ *
+ * Return: true if the filter of @type needs to be handled by ops layer, false
+ * otherwise.
+ */
+bool damos_filter_for_ops(enum damos_filter_type type)
+{
+ switch (type) {
+ case DAMOS_FILTER_TYPE_ADDR:
+ case DAMOS_FILTER_TYPE_TARGET:
+ return false;
+ default:
+ break;
+ }
+ return true;
+}
+
void damos_add_filter(struct damos *s, struct damos_filter *f)
{
- list_add_tail(&f->list, &s->filters);
+ if (damos_filter_for_ops(f->type))
+ list_add_tail(&f->list, &s->ops_filters);
+ else
+ list_add_tail(&f->list, &s->filters);
}
static void damos_del_filter(struct damos_filter *f)
@@ -373,7 +394,9 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
* or damon_attrs are updated.
*/
scheme->next_apply_sis = 0;
+ scheme->walk_completed = false;
INIT_LIST_HEAD(&scheme->filters);
+ INIT_LIST_HEAD(&scheme->ops_filters);
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
@@ -501,7 +524,7 @@ struct damon_ctx *damon_new_ctx(void)
ctx->attrs.ops_update_interval = 60 * 1000 * 1000;
ctx->passed_sample_intervals = 0;
- /* These will be set from kdamond_init_intervals_sis() */
+ /* These will be set from kdamond_init_ctx() */
ctx->next_aggregation_sis = 0;
ctx->next_ops_update_sis = 0;
@@ -579,11 +602,25 @@ static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses,
}
static void damon_update_monitoring_result(struct damon_region *r,
- struct damon_attrs *old_attrs, struct damon_attrs *new_attrs)
+ struct damon_attrs *old_attrs, struct damon_attrs *new_attrs,
+ bool aggregating)
{
- r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses,
- old_attrs, new_attrs);
- r->nr_accesses_bp = r->nr_accesses * 10000;
+ if (!aggregating) {
+ r->nr_accesses = damon_nr_accesses_for_new_attrs(
+ r->nr_accesses, old_attrs, new_attrs);
+ r->nr_accesses_bp = r->nr_accesses * 10000;
+ } else {
+ /*
+ * if this is called in the middle of the aggregation, reset
+ * the aggregations we made so far for this aggregation
+ * interval. In other words, make the status like
+ * kdamond_reset_aggregated() is called.
+ */
+ r->last_nr_accesses = damon_nr_accesses_for_new_attrs(
+ r->last_nr_accesses, old_attrs, new_attrs);
+ r->nr_accesses_bp = r->last_nr_accesses * 10000;
+ r->nr_accesses = 0;
+ }
r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs);
}
@@ -596,7 +633,7 @@ static void damon_update_monitoring_result(struct damon_region *r,
* ->nr_accesses and ->age of given damon_ctx's regions for new damon_attrs.
*/
static void damon_update_monitoring_results(struct damon_ctx *ctx,
- struct damon_attrs *new_attrs)
+ struct damon_attrs *new_attrs, bool aggregating)
{
struct damon_attrs *old_attrs = &ctx->attrs;
struct damon_target *t;
@@ -611,7 +648,26 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx,
damon_for_each_target(t, ctx)
damon_for_each_region(r, t)
damon_update_monitoring_result(
- r, old_attrs, new_attrs);
+ r, old_attrs, new_attrs, aggregating);
+}
+
+/*
+ * damon_valid_intervals_goal() - return if the intervals goal of @attrs is
+ * valid.
+ */
+static bool damon_valid_intervals_goal(struct damon_attrs *attrs)
+{
+ struct damon_intervals_goal *goal = &attrs->intervals_goal;
+
+ /* tuning is disabled */
+ if (!goal->aggrs)
+ return true;
+ if (goal->min_sample_us > goal->max_sample_us)
+ return false;
+ if (attrs->sample_interval < goal->min_sample_us ||
+ goal->max_sample_us < attrs->sample_interval)
+ return false;
+ return true;
}
/**
@@ -619,10 +675,10 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx,
* @ctx: monitoring context
* @attrs: monitoring attributes
*
- * This function should be called while the kdamond is not running, or an
- * access check results aggregation is not ongoing (e.g., from
- * &struct damon_callback->after_aggregation or
- * &struct damon_callback->after_wmarks_check callbacks).
+ * This function should be called while the kdamond is not running, an access
+ * check results aggregation is not ongoing (e.g., from &struct
+ * damon_callback->after_aggregation or &struct
+ * damon_callback->after_wmarks_check callbacks), or from damon_call().
*
* Every time interval is in micro-seconds.
*
@@ -633,6 +689,11 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
unsigned long sample_interval = attrs->sample_interval ?
attrs->sample_interval : 1;
struct damos *s;
+ bool aggregating = ctx->passed_sample_intervals <
+ ctx->next_aggregation_sis;
+
+ if (!damon_valid_intervals_goal(attrs))
+ return -EINVAL;
if (attrs->min_nr_regions < 3)
return -EINVAL;
@@ -641,12 +702,16 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
if (attrs->sample_interval > attrs->aggr_interval)
return -EINVAL;
+ /* calls from core-external doesn't set this. */
+ if (!attrs->aggr_samples)
+ attrs->aggr_samples = attrs->aggr_interval / sample_interval;
+
ctx->next_aggregation_sis = ctx->passed_sample_intervals +
attrs->aggr_interval / sample_interval;
ctx->next_ops_update_sis = ctx->passed_sample_intervals +
attrs->ops_update_interval / sample_interval;
- damon_update_monitoring_results(ctx, attrs);
+ damon_update_monitoring_results(ctx, attrs, aggregating);
ctx->attrs = *attrs;
damon_for_each_scheme(s, ctx)
@@ -776,6 +841,9 @@ static void damos_commit_filter_arg(
case DAMOS_FILTER_TYPE_TARGET:
dst->target_idx = src->target_idx;
break;
+ case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE:
+ dst->sz_range = src->sz_range;
+ break;
default:
break;
}
@@ -789,7 +857,7 @@ static void damos_commit_filter(
damos_commit_filter_arg(dst, src);
}
-static int damos_commit_filters(struct damos *dst, struct damos *src)
+static int damos_commit_core_filters(struct damos *dst, struct damos *src)
{
struct damos_filter *dst_filter, *next, *src_filter, *new_filter;
int i = 0, j = 0;
@@ -817,6 +885,74 @@ static int damos_commit_filters(struct damos *dst, struct damos *src)
return 0;
}
+static int damos_commit_ops_filters(struct damos *dst, struct damos *src)
+{
+ struct damos_filter *dst_filter, *next, *src_filter, *new_filter;
+ int i = 0, j = 0;
+
+ damos_for_each_ops_filter_safe(dst_filter, next, dst) {
+ src_filter = damos_nth_filter(i++, src);
+ if (src_filter)
+ damos_commit_filter(dst_filter, src_filter);
+ else
+ damos_destroy_filter(dst_filter);
+ }
+
+ damos_for_each_ops_filter_safe(src_filter, next, src) {
+ if (j++ < i)
+ continue;
+
+ new_filter = damos_new_filter(
+ src_filter->type, src_filter->matching,
+ src_filter->allow);
+ if (!new_filter)
+ return -ENOMEM;
+ damos_commit_filter_arg(new_filter, src_filter);
+ damos_add_filter(dst, new_filter);
+ }
+ return 0;
+}
+
+/**
+ * damos_filters_default_reject() - decide whether to reject memory that didn't
+ * match with any given filter.
+ * @filters: Given DAMOS filters of a group.
+ */
+static bool damos_filters_default_reject(struct list_head *filters)
+{
+ struct damos_filter *last_filter;
+
+ if (list_empty(filters))
+ return false;
+ last_filter = list_last_entry(filters, struct damos_filter, list);
+ return last_filter->allow;
+}
+
+static void damos_set_filters_default_reject(struct damos *s)
+{
+ if (!list_empty(&s->ops_filters))
+ s->core_filters_default_reject = false;
+ else
+ s->core_filters_default_reject =
+ damos_filters_default_reject(&s->filters);
+ s->ops_filters_default_reject =
+ damos_filters_default_reject(&s->ops_filters);
+}
+
+static int damos_commit_filters(struct damos *dst, struct damos *src)
+{
+ int err;
+
+ err = damos_commit_core_filters(dst, src);
+ if (err)
+ return err;
+ err = damos_commit_ops_filters(dst, src);
+ if (err)
+ return err;
+ damos_set_filters_default_reject(dst);
+ return 0;
+}
+
static struct damos *damon_nth_scheme(int n, struct damon_ctx *ctx)
{
struct damos *s;
@@ -1275,6 +1411,65 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
}
}
+static unsigned long damon_get_intervals_score(struct damon_ctx *c)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ unsigned long sz_region, max_access_events = 0, access_events = 0;
+ unsigned long target_access_events;
+ unsigned long goal_bp = c->attrs.intervals_goal.access_bp;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ sz_region = damon_sz_region(r);
+ max_access_events += sz_region * c->attrs.aggr_samples;
+ access_events += sz_region * r->nr_accesses;
+ }
+ }
+ target_access_events = max_access_events * goal_bp / 10000;
+ return access_events * 10000 / target_access_events;
+}
+
+static unsigned long damon_feed_loop_next_input(unsigned long last_input,
+ unsigned long score);
+
+static unsigned long damon_get_intervals_adaptation_bp(struct damon_ctx *c)
+{
+ unsigned long score_bp, adaptation_bp;
+
+ score_bp = damon_get_intervals_score(c);
+ adaptation_bp = damon_feed_loop_next_input(100000000, score_bp) /
+ 10000;
+ /*
+ * adaptaion_bp ranges from 1 to 20,000. Avoid too rapid reduction of
+ * the intervals by rescaling [1,10,000] to [5000, 10,000].
+ */
+ if (adaptation_bp <= 10000)
+ adaptation_bp = 5000 + adaptation_bp / 2;
+ return adaptation_bp;
+}
+
+static void kdamond_tune_intervals(struct damon_ctx *c)
+{
+ unsigned long adaptation_bp;
+ struct damon_attrs new_attrs;
+ struct damon_intervals_goal *goal;
+
+ adaptation_bp = damon_get_intervals_adaptation_bp(c);
+ if (adaptation_bp == 10000)
+ return;
+
+ new_attrs = c->attrs;
+ goal = &c->attrs.intervals_goal;
+ new_attrs.sample_interval = min(goal->max_sample_us,
+ c->attrs.sample_interval * adaptation_bp / 10000);
+ new_attrs.sample_interval = max(goal->min_sample_us,
+ new_attrs.sample_interval);
+ new_attrs.aggr_interval = new_attrs.sample_interval *
+ c->attrs.aggr_samples;
+ damon_set_attrs(c, &new_attrs);
+}
+
static void damon_split_region_at(struct damon_target *t,
struct damon_region *r, unsigned long sz_r);
@@ -1429,11 +1624,15 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
{
struct damos_filter *filter;
+ s->core_filters_allowed = false;
damos_for_each_filter(filter, s) {
- if (damos_filter_match(ctx, t, r, filter))
+ if (damos_filter_match(ctx, t, r, filter)) {
+ if (filter->allow)
+ s->core_filters_allowed = true;
return !filter->allow;
+ }
}
- return false;
+ return s->core_filters_default_reject;
}
/*
@@ -1453,11 +1652,13 @@ static void damos_walk_call_walk(struct damon_ctx *ctx, struct damon_target *t,
{
struct damos_walk_control *control;
- mutex_lock(&ctx->walk_control_lock);
+ if (s->walk_completed)
+ return;
+
control = ctx->walk_control;
- mutex_unlock(&ctx->walk_control_lock);
if (!control)
return;
+
control->walk_fn(control->data, ctx, t, r, s, sz_filter_passed);
}
@@ -1477,9 +1678,7 @@ static void damos_walk_complete(struct damon_ctx *ctx, struct damos *s)
struct damos *siter;
struct damos_walk_control *control;
- mutex_lock(&ctx->walk_control_lock);
control = ctx->walk_control;
- mutex_unlock(&ctx->walk_control_lock);
if (!control)
return;
@@ -1489,10 +1688,11 @@ static void damos_walk_complete(struct damon_ctx *ctx, struct damos *s)
if (!siter->walk_completed)
return;
}
+ damon_for_each_scheme(siter, ctx)
+ siter->walk_completed = false;
+
complete(&control->completion);
- mutex_lock(&ctx->walk_control_lock);
ctx->walk_control = NULL;
- mutex_unlock(&ctx->walk_control_lock);
}
/*
@@ -1530,7 +1730,6 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
struct timespec64 begin, end;
unsigned long sz_applied = 0;
unsigned long sz_ops_filter_passed = 0;
- int err = 0;
/*
* We plan to support multiple context per kdamond, as DAMON sysfs
* implies with 'nr_contexts' file. Nevertheless, only single context
@@ -1570,14 +1769,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
if (damos_filter_out(c, t, r, s))
return;
ktime_get_coarse_ts64(&begin);
- if (c->callback.before_damos_apply)
- err = c->callback.before_damos_apply(c, t, r, s);
- if (!err) {
- trace_damos_before_apply(cidx, sidx, tidx, r,
- damon_nr_regions(t), do_trace);
- sz_applied = c->ops.apply_scheme(c, t, r, s,
- &sz_ops_filter_passed);
- }
+ trace_damos_before_apply(cidx, sidx, tidx, r,
+ damon_nr_regions(t), do_trace);
+ sz_applied = c->ops.apply_scheme(c, t, r, s,
+ &sz_ops_filter_passed);
damos_walk_call_walk(c, t, r, s, sz_ops_filter_passed);
ktime_get_coarse_ts64(&end);
quota->total_charged_ns += timespec64_to_ns(&end) -
@@ -1839,6 +2034,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
if (!has_schemes_to_apply)
return;
+ mutex_lock(&c->walk_control_lock);
damon_for_each_target(t, c) {
damon_for_each_region_safe(r, next_r, t)
damon_do_apply_schemes(c, t, r);
@@ -1851,7 +2047,9 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
s->next_apply_sis = c->passed_sample_intervals +
(s->apply_interval_us ? s->apply_interval_us :
c->attrs.aggr_interval) / sample_interval;
+ s->last_applied = NULL;
}
+ mutex_unlock(&c->walk_control_lock);
}
/*
@@ -2164,7 +2362,7 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
return -EBUSY;
}
-static void kdamond_init_intervals_sis(struct damon_ctx *ctx)
+static void kdamond_init_ctx(struct damon_ctx *ctx)
{
unsigned long sample_interval = ctx->attrs.sample_interval ?
ctx->attrs.sample_interval : 1;
@@ -2175,11 +2373,14 @@ static void kdamond_init_intervals_sis(struct damon_ctx *ctx)
ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval;
ctx->next_ops_update_sis = ctx->attrs.ops_update_interval /
sample_interval;
+ ctx->next_intervals_tune_sis = ctx->next_aggregation_sis *
+ ctx->attrs.intervals_goal.aggrs;
damon_for_each_scheme(scheme, ctx) {
apply_interval = scheme->apply_interval_us ?
scheme->apply_interval_us : ctx->attrs.aggr_interval;
scheme->next_apply_sis = apply_interval / sample_interval;
+ damos_set_filters_default_reject(scheme);
}
}
@@ -2197,12 +2398,10 @@ static int kdamond_fn(void *data)
pr_debug("kdamond (%d) starts\n", current->pid);
complete(&ctx->kdamond_started);
- kdamond_init_intervals_sis(ctx);
+ kdamond_init_ctx(ctx);
if (ctx->ops.init)
ctx->ops.init(ctx);
- if (ctx->callback.before_start && ctx->callback.before_start(ctx))
- goto done;
ctx->regions_score_histogram = kmalloc_array(DAMOS_MAX_SCORE + 1,
sizeof(*ctx->regions_score_histogram), GFP_KERNEL);
if (!ctx->regions_score_histogram)
@@ -2227,10 +2426,6 @@ static int kdamond_fn(void *data)
if (ctx->ops.prepare_access_checks)
ctx->ops.prepare_access_checks(ctx);
- if (ctx->callback.after_sampling &&
- ctx->callback.after_sampling(ctx))
- break;
- kdamond_call(ctx, false);
kdamond_usleep(sample_interval);
ctx->passed_sample_intervals++;
@@ -2248,9 +2443,10 @@ static int kdamond_fn(void *data)
}
/*
- * do kdamond_apply_schemes() after kdamond_merge_regions() if
- * possible, to reduce overhead
+ * do kdamond_call() and kdamond_apply_schemes() after
+ * kdamond_merge_regions() if possible, to reduce overhead
*/
+ kdamond_call(ctx, false);
if (!list_empty(&ctx->schemes))
kdamond_apply_schemes(ctx);
else
@@ -2259,13 +2455,40 @@ static int kdamond_fn(void *data)
sample_interval = ctx->attrs.sample_interval ?
ctx->attrs.sample_interval : 1;
if (ctx->passed_sample_intervals >= next_aggregation_sis) {
+ if (ctx->attrs.intervals_goal.aggrs &&
+ ctx->passed_sample_intervals >=
+ ctx->next_intervals_tune_sis) {
+ /*
+ * ctx->next_aggregation_sis might be updated
+ * from kdamond_call(). In the case,
+ * damon_set_attrs() which will be called from
+ * kdamond_tune_interval() may wrongly think
+ * this is in the middle of the current
+ * aggregation, and make aggregation
+ * information reset for all regions. Then,
+ * following kdamond_reset_aggregated() call
+ * will make the region information invalid,
+ * particularly for ->nr_accesses_bp.
+ *
+ * Reset ->next_aggregation_sis to avoid that.
+ * It will anyway correctly updated after this
+ * if caluse.
+ */
+ ctx->next_aggregation_sis =
+ next_aggregation_sis;
+ ctx->next_intervals_tune_sis +=
+ ctx->attrs.aggr_samples *
+ ctx->attrs.intervals_goal.aggrs;
+ kdamond_tune_intervals(ctx);
+ sample_interval = ctx->attrs.sample_interval ?
+ ctx->attrs.sample_interval : 1;
+
+ }
ctx->next_aggregation_sis = next_aggregation_sis +
ctx->attrs.aggr_interval / sample_interval;
kdamond_reset_aggregated(ctx);
kdamond_split_regions(ctx);
- if (ctx->ops.reset_aggregated)
- ctx->ops.reset_aggregated(ctx);
}
if (ctx->passed_sample_intervals >= next_ops_update_sis) {
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index d25d99cb5f2b..0db1fc70c84d 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -9,6 +9,8 @@
#include <linux/page_idle.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include "ops-common.h"
@@ -24,7 +26,7 @@ struct folio *damon_get_folio(unsigned long pfn)
struct page *page = pfn_to_online_page(pfn);
struct folio *folio;
- if (!page || PageTail(page))
+ if (!page)
return NULL;
folio = page_folio(page);
@@ -39,12 +41,29 @@ struct folio *damon_get_folio(unsigned long pfn)
void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr)
{
- struct folio *folio = damon_get_folio(pte_pfn(ptep_get(pte)));
+ pte_t pteval = ptep_get(pte);
+ struct folio *folio;
+ bool young = false;
+ unsigned long pfn;
+
+ if (likely(pte_present(pteval)))
+ pfn = pte_pfn(pteval);
+ else
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ folio = damon_get_folio(pfn);
if (!folio)
return;
- if (ptep_clear_young_notify(vma, addr, pte))
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that actually map pages
+ * are "old" from a CPU perspective. The MMU notifier takes care of any
+ * device aspects.
+ */
+ if (likely(pte_present(pteval)))
+ young |= ptep_test_and_clear_young(vma, addr, pte);
+ young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE);
+ if (young)
folio_set_young(folio);
folio_set_idle(folio);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 0f9ae14f884d..1b70d3f36046 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -92,12 +92,20 @@ static bool damon_folio_young_one(struct folio *folio,
{
bool *accessed = arg;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
+ pte_t pte;
*accessed = false;
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
if (pvmw.pte) {
- *accessed = pte_young(ptep_get(pvmw.pte)) ||
+ pte = ptep_get(pvmw.pte);
+
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are "old" from a CPU perspective.
+ * The MMU notifier takes care of any device aspects.
+ */
+ *accessed = (pte_present(pte) && pte_young(pte)) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
} else {
@@ -203,11 +211,15 @@ static bool damos_pa_filter_match(struct damos_filter *filter,
{
bool matched = false;
struct mem_cgroup *memcg;
+ size_t folio_sz;
switch (filter->type) {
case DAMOS_FILTER_TYPE_ANON:
matched = folio_test_anon(folio);
break;
+ case DAMOS_FILTER_TYPE_ACTIVE:
+ matched = folio_test_active(folio);
+ break;
case DAMOS_FILTER_TYPE_MEMCG:
rcu_read_lock();
memcg = folio_memcg_check(folio);
@@ -222,6 +234,14 @@ static bool damos_pa_filter_match(struct damos_filter *filter,
if (matched)
damon_folio_mkold(folio);
break;
+ case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE:
+ folio_sz = folio_size(folio);
+ matched = filter->sz_range.min <= folio_sz &&
+ folio_sz <= filter->sz_range.max;
+ break;
+ case DAMOS_FILTER_TYPE_UNMAPPED:
+ matched = !folio_mapped(folio) || !folio_raw_mapping(folio);
+ break;
default:
break;
}
@@ -236,10 +256,24 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
{
struct damos_filter *filter;
- damos_for_each_filter(filter, scheme) {
+ if (scheme->core_filters_allowed)
+ return false;
+
+ damos_for_each_ops_filter(filter, scheme) {
if (damos_pa_filter_match(filter, folio))
return !filter->allow;
}
+ return scheme->ops_filters_default_reject;
+}
+
+static bool damon_pa_invalid_damos_folio(struct folio *folio, struct damos *s)
+{
+ if (!folio)
+ return true;
+ if (folio == s->last_applied) {
+ folio_put(folio);
+ return true;
+ }
return false;
}
@@ -250,9 +284,10 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
LIST_HEAD(folio_list);
bool install_young_filter = true;
struct damos_filter *filter;
+ struct folio *folio;
/* check access in page level again by default */
- damos_for_each_filter(filter, s) {
+ damos_for_each_ops_filter(filter, s) {
if (filter->type == DAMOS_FILTER_TYPE_YOUNG) {
install_young_filter = false;
break;
@@ -266,11 +301,13 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
damos_add_filter(s, filter);
}
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct folio *folio = damon_get_folio(PHYS_PFN(addr));
-
- if (!folio)
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (damon_pa_invalid_damos_folio(folio, s)) {
+ addr += PAGE_SIZE;
continue;
+ }
if (damos_pa_filter_out(s, folio))
goto put_folio;
@@ -286,12 +323,14 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
else
list_add(&folio->lru, &folio_list);
put_folio:
+ addr += folio_size(folio);
folio_put(folio);
}
if (install_young_filter)
damos_destroy_filter(filter);
applied = reclaim_pages(&folio_list);
cond_resched();
+ s->last_applied = folio;
return applied * PAGE_SIZE;
}
@@ -300,12 +339,15 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
unsigned long *sz_filter_passed)
{
unsigned long addr, applied = 0;
+ struct folio *folio;
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct folio *folio = damon_get_folio(PHYS_PFN(addr));
-
- if (!folio)
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (damon_pa_invalid_damos_folio(folio, s)) {
+ addr += PAGE_SIZE;
continue;
+ }
if (damos_pa_filter_out(s, folio))
goto put_folio;
@@ -318,8 +360,10 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
folio_deactivate(folio);
applied += folio_nr_pages(folio);
put_folio:
+ addr += folio_size(folio);
folio_put(folio);
}
+ s->last_applied = folio;
return applied * PAGE_SIZE;
}
@@ -463,12 +507,15 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
{
unsigned long addr, applied;
LIST_HEAD(folio_list);
+ struct folio *folio;
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct folio *folio = damon_get_folio(PHYS_PFN(addr));
-
- if (!folio)
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (damon_pa_invalid_damos_folio(folio, s)) {
+ addr += PAGE_SIZE;
continue;
+ }
if (damos_pa_filter_out(s, folio))
goto put_folio;
@@ -479,10 +526,12 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
goto put_folio;
list_add(&folio->lru, &folio_list);
put_folio:
+ addr += folio_size(folio);
folio_put(folio);
}
applied = damon_pa_migrate_pages(&folio_list, s->target_nid);
cond_resched();
+ s->last_applied = folio;
return applied * PAGE_SIZE;
}
@@ -490,7 +539,7 @@ static bool damon_pa_scheme_has_filter(struct damos *s)
{
struct damos_filter *f;
- damos_for_each_filter(f, s)
+ damos_for_each_ops_filter(f, s)
return true;
return false;
}
@@ -500,15 +549,15 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
{
unsigned long addr;
LIST_HEAD(folio_list);
+ struct folio *folio;
if (!damon_pa_scheme_has_filter(s))
return 0;
addr = r->ar.start;
while (addr < r->ar.end) {
- struct folio *folio = damon_get_folio(PHYS_PFN(addr));
-
- if (!folio) {
+ folio = damon_get_folio(PHYS_PFN(addr));
+ if (damon_pa_invalid_damos_folio(folio, s)) {
addr += PAGE_SIZE;
continue;
}
@@ -518,6 +567,7 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
addr += folio_size(folio);
folio_put(folio);
}
+ s->last_applied = folio;
return 0;
}
@@ -574,7 +624,6 @@ static int __init damon_pa_initcall(void)
.update = NULL,
.prepare_access_checks = damon_pa_prepare_access_checks,
.check_accesses = damon_pa_check_accesses,
- .reset_aggregated = NULL,
.target_valid = NULL,
.cleanup = NULL,
.apply_scheme = damon_pa_apply_scheme,
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 98f93ae9f59e..23b562df0839 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -309,26 +309,46 @@ static const struct kobj_type damon_sysfs_stats_ktype = {
* filter directory
*/
+/*
+ * enum damos_sysfs_filter_handle_layer - Layers handling filters of a dir.
+ */
+enum damos_sysfs_filter_handle_layer {
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH,
+};
+
struct damon_sysfs_scheme_filter {
struct kobject kobj;
+ enum damos_sysfs_filter_handle_layer handle_layer;
enum damos_filter_type type;
bool matching;
bool allow;
char *memcg_path;
struct damon_addr_range addr_range;
+ struct damon_size_range sz_range;
int target_idx;
};
-static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
+static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(
+ enum damos_sysfs_filter_handle_layer layer)
{
- return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL);
+ struct damon_sysfs_scheme_filter *filter;
+
+ filter = kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL);
+ if (filter)
+ filter->handle_layer = layer;
+ return filter;
}
/* Should match with enum damos_filter_type */
static const char * const damon_sysfs_scheme_filter_type_strs[] = {
"anon",
+ "active",
"memcg",
"young",
+ "hugepage_size",
+ "unmapped",
"addr",
"target",
};
@@ -343,6 +363,23 @@ static ssize_t type_show(struct kobject *kobj,
damon_sysfs_scheme_filter_type_strs[filter->type]);
}
+static bool damos_sysfs_scheme_filter_valid_type(
+ enum damos_sysfs_filter_handle_layer layer,
+ enum damos_filter_type type)
+{
+ switch (layer) {
+ case DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH:
+ return true;
+ case DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE:
+ return !damos_filter_for_ops(type);
+ case DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS:
+ return damos_filter_for_ops(type);
+ default:
+ break;
+ }
+ return false;
+}
+
static ssize_t type_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
@@ -354,6 +391,9 @@ static ssize_t type_store(struct kobject *kobj,
for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) {
if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[
type])) {
+ if (!damos_sysfs_scheme_filter_valid_type(
+ filter->handle_layer, type))
+ break;
filter->type = type;
ret = count;
break;
@@ -473,6 +513,44 @@ static ssize_t addr_end_store(struct kobject *kobj,
return err ? err : count;
}
+static ssize_t min_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%lu\n", filter->sz_range.min);
+}
+
+static ssize_t min_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ int err = kstrtoul(buf, 0, &filter->sz_range.min);
+
+ return err ? err : count;
+}
+
+static ssize_t max_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%lu\n", filter->sz_range.max);
+}
+
+static ssize_t max_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ int err = kstrtoul(buf, 0, &filter->sz_range.max);
+
+ return err ? err : count;
+}
+
static ssize_t damon_target_idx_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -519,6 +597,12 @@ static struct kobj_attribute damon_sysfs_scheme_filter_addr_start_attr =
static struct kobj_attribute damon_sysfs_scheme_filter_addr_end_attr =
__ATTR_RW_MODE(addr_end, 0600);
+static struct kobj_attribute damon_sysfs_scheme_filter_min_attr =
+ __ATTR_RW_MODE(min, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_max_attr =
+ __ATTR_RW_MODE(max, 0600);
+
static struct kobj_attribute damon_sysfs_scheme_filter_damon_target_idx_attr =
__ATTR_RW_MODE(damon_target_idx, 0600);
@@ -529,6 +613,8 @@ static struct attribute *damon_sysfs_scheme_filter_attrs[] = {
&damon_sysfs_scheme_filter_memcg_path_attr.attr,
&damon_sysfs_scheme_filter_addr_start_attr.attr,
&damon_sysfs_scheme_filter_addr_end_attr.attr,
+ &damon_sysfs_scheme_filter_min_attr.attr,
+ &damon_sysfs_scheme_filter_max_attr.attr,
&damon_sysfs_scheme_filter_damon_target_idx_attr.attr,
NULL,
};
@@ -546,14 +632,20 @@ static const struct kobj_type damon_sysfs_scheme_filter_ktype = {
struct damon_sysfs_scheme_filters {
struct kobject kobj;
+ enum damos_sysfs_filter_handle_layer handle_layer;
struct damon_sysfs_scheme_filter **filters_arr;
int nr;
};
static struct damon_sysfs_scheme_filters *
-damon_sysfs_scheme_filters_alloc(void)
+damon_sysfs_scheme_filters_alloc(enum damos_sysfs_filter_handle_layer layer)
{
- return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL);
+ struct damon_sysfs_scheme_filters *filters;
+
+ filters = kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL);
+ if (filters)
+ filters->handle_layer = layer;
+ return filters;
}
static void damon_sysfs_scheme_filters_rm_dirs(
@@ -586,7 +678,8 @@ static int damon_sysfs_scheme_filters_add_dirs(
filters->filters_arr = filters_arr;
for (i = 0; i < nr_filters; i++) {
- filter = damon_sysfs_scheme_filter_alloc();
+ filter = damon_sysfs_scheme_filter_alloc(
+ filters->handle_layer);
if (!filter) {
damon_sysfs_scheme_filters_rm_dirs(filters);
return -ENOMEM;
@@ -1379,7 +1472,7 @@ static int damon_sysfs_access_pattern_add_range_dir(
if (!range)
return -ENOMEM;
err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype,
- &access_pattern->kobj, name);
+ &access_pattern->kobj, "%s", name);
if (err)
kobject_put(&range->kobj);
else
@@ -1455,6 +1548,8 @@ struct damon_sysfs_scheme {
unsigned long apply_interval_us;
struct damon_sysfs_quotas *quotas;
struct damon_sysfs_watermarks *watermarks;
+ struct damon_sysfs_scheme_filters *core_filters;
+ struct damon_sysfs_scheme_filters *ops_filters;
struct damon_sysfs_scheme_filters *filters;
struct damon_sysfs_stats *stats;
struct damon_sysfs_scheme_regions *tried_regions;
@@ -1555,21 +1650,53 @@ static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
return err;
}
-static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme)
+static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme,
+ enum damos_sysfs_filter_handle_layer layer, const char *name,
+ struct damon_sysfs_scheme_filters **filters_ptr)
{
struct damon_sysfs_scheme_filters *filters =
- damon_sysfs_scheme_filters_alloc();
+ damon_sysfs_scheme_filters_alloc(layer);
int err;
if (!filters)
return -ENOMEM;
err = kobject_init_and_add(&filters->kobj,
&damon_sysfs_scheme_filters_ktype, &scheme->kobj,
- "filters");
+ "%s", name);
if (err)
kobject_put(&filters->kobj);
else
- scheme->filters = filters;
+ *filters_ptr = filters;
+ return err;
+}
+
+static int damos_sysfs_set_filter_dirs(struct damon_sysfs_scheme *scheme)
+{
+ int err;
+
+ err = damon_sysfs_scheme_set_filters(scheme,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_BOTH, "filters",
+ &scheme->filters);
+ if (err)
+ return err;
+ err = damon_sysfs_scheme_set_filters(scheme,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_CORE, "core_filters",
+ &scheme->core_filters);
+ if (err)
+ goto put_filters_out;
+ err = damon_sysfs_scheme_set_filters(scheme,
+ DAMOS_SYSFS_FILTER_HANDLE_LAYER_OPS, "ops_filters",
+ &scheme->ops_filters);
+ if (err)
+ goto put_core_filters_out;
+ return 0;
+
+put_core_filters_out:
+ kobject_put(&scheme->core_filters->kobj);
+ scheme->core_filters = NULL;
+put_filters_out:
+ kobject_put(&scheme->filters->kobj);
+ scheme->filters = NULL;
return err;
}
@@ -1621,7 +1748,7 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
err = damon_sysfs_scheme_set_watermarks(scheme);
if (err)
goto put_quotas_access_pattern_out;
- err = damon_sysfs_scheme_set_filters(scheme);
+ err = damos_sysfs_set_filter_dirs(scheme);
if (err)
goto put_watermarks_quotas_access_pattern_out;
err = damon_sysfs_scheme_set_stats(scheme);
@@ -1636,6 +1763,10 @@ put_tried_regions_out:
kobject_put(&scheme->tried_regions->kobj);
scheme->tried_regions = NULL;
put_filters_watermarks_quotas_access_pattern_out:
+ kobject_put(&scheme->ops_filters->kobj);
+ scheme->ops_filters = NULL;
+ kobject_put(&scheme->core_filters->kobj);
+ scheme->core_filters = NULL;
kobject_put(&scheme->filters->kobj);
scheme->filters = NULL;
put_watermarks_quotas_access_pattern_out:
@@ -1659,6 +1790,10 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
kobject_put(&scheme->watermarks->kobj);
damon_sysfs_scheme_filters_rm_dirs(scheme->filters);
kobject_put(&scheme->filters->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(scheme->core_filters);
+ kobject_put(&scheme->core_filters->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(scheme->ops_filters);
+ kobject_put(&scheme->ops_filters->kobj);
kobject_put(&scheme->stats->kobj);
damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions);
kobject_put(&scheme->tried_regions->kobj);
@@ -1953,6 +2088,13 @@ static int damon_sysfs_add_scheme_filters(struct damos *scheme,
filter->addr_range = sysfs_filter->addr_range;
} else if (filter->type == DAMOS_FILTER_TYPE_TARGET) {
filter->target_idx = sysfs_filter->target_idx;
+ } else if (filter->type == DAMOS_FILTER_TYPE_HUGEPAGE_SIZE) {
+ if (sysfs_filter->sz_range.min >
+ sysfs_filter->sz_range.max) {
+ damos_destroy_filter(filter);
+ return -EINVAL;
+ }
+ filter->sz_range = sysfs_filter->sz_range;
}
damos_add_filter(scheme, filter);
@@ -2048,8 +2190,6 @@ static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
- struct damon_sysfs_scheme_filters *sysfs_filters =
- sysfs_scheme->filters;
struct damos *scheme;
int err;
@@ -2089,7 +2229,17 @@ static struct damos *damon_sysfs_mk_scheme(
return NULL;
}
- err = damon_sysfs_add_scheme_filters(scheme, sysfs_filters);
+ err = damon_sysfs_add_scheme_filters(scheme, sysfs_scheme->core_filters);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
+ err = damon_sysfs_add_scheme_filters(scheme, sysfs_scheme->ops_filters);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
+ err = damon_sysfs_add_scheme_filters(scheme, sysfs_scheme->filters);
if (err) {
damon_destroy_scheme(scheme);
return NULL;
@@ -2192,7 +2342,6 @@ void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes,
}
}
-/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
int damon_sysfs_schemes_clear_regions(
struct damon_sysfs_schemes *sysfs_schemes)
{
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index deeab04d3b46..1af6aff35d84 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -409,6 +409,164 @@ static const struct kobj_type damon_sysfs_targets_ktype = {
};
/*
+ * intervals goal directory
+ */
+
+struct damon_sysfs_intervals_goal {
+ struct kobject kobj;
+ unsigned long access_bp;
+ unsigned long aggrs;
+ unsigned long min_sample_us;
+ unsigned long max_sample_us;
+};
+
+static struct damon_sysfs_intervals_goal *damon_sysfs_intervals_goal_alloc(
+ unsigned long access_bp, unsigned long aggrs,
+ unsigned long min_sample_us, unsigned long max_sample_us)
+{
+ struct damon_sysfs_intervals_goal *goal = kmalloc(sizeof(*goal),
+ GFP_KERNEL);
+
+ if (!goal)
+ return NULL;
+
+ goal->kobj = (struct kobject){};
+ goal->access_bp = access_bp;
+ goal->aggrs = aggrs;
+ goal->min_sample_us = min_sample_us;
+ goal->max_sample_us = max_sample_us;
+ return goal;
+}
+
+static ssize_t access_bp_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+
+ return sysfs_emit(buf, "%lu\n", goal->access_bp);
+}
+
+static ssize_t access_bp_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+ unsigned long nr;
+ int err = kstrtoul(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ goal->access_bp = nr;
+ return count;
+}
+
+static ssize_t aggrs_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+
+ return sysfs_emit(buf, "%lu\n", goal->aggrs);
+}
+
+static ssize_t aggrs_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+ unsigned long nr;
+ int err = kstrtoul(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ goal->aggrs = nr;
+ return count;
+}
+
+static ssize_t min_sample_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+
+ return sysfs_emit(buf, "%lu\n", goal->min_sample_us);
+}
+
+static ssize_t min_sample_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+ unsigned long nr;
+ int err = kstrtoul(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ goal->min_sample_us = nr;
+ return count;
+}
+
+static ssize_t max_sample_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+
+ return sysfs_emit(buf, "%lu\n", goal->max_sample_us);
+}
+
+static ssize_t max_sample_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals_goal *goal = container_of(kobj,
+ struct damon_sysfs_intervals_goal, kobj);
+ unsigned long nr;
+ int err = kstrtoul(buf, 0, &nr);
+
+ if (err)
+ return err;
+
+ goal->max_sample_us = nr;
+ return count;
+}
+
+static void damon_sysfs_intervals_goal_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_intervals_goal, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_intervals_goal_access_bp_attr =
+ __ATTR_RW_MODE(access_bp, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_goal_aggrs_attr =
+ __ATTR_RW_MODE(aggrs, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_goal_min_sample_us_attr =
+ __ATTR_RW_MODE(min_sample_us, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_goal_max_sample_us_attr =
+ __ATTR_RW_MODE(max_sample_us, 0600);
+
+static struct attribute *damon_sysfs_intervals_goal_attrs[] = {
+ &damon_sysfs_intervals_goal_access_bp_attr.attr,
+ &damon_sysfs_intervals_goal_aggrs_attr.attr,
+ &damon_sysfs_intervals_goal_min_sample_us_attr.attr,
+ &damon_sysfs_intervals_goal_max_sample_us_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_intervals_goal);
+
+static const struct kobj_type damon_sysfs_intervals_goal_ktype = {
+ .release = damon_sysfs_intervals_goal_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_intervals_goal_groups,
+};
+
+/*
* intervals directory
*/
@@ -417,6 +575,7 @@ struct damon_sysfs_intervals {
unsigned long sample_us;
unsigned long aggr_us;
unsigned long update_us;
+ struct damon_sysfs_intervals_goal *intervals_goal;
};
static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc(
@@ -436,6 +595,32 @@ static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc(
return intervals;
}
+static int damon_sysfs_intervals_add_dirs(struct damon_sysfs_intervals *intervals)
+{
+ struct damon_sysfs_intervals_goal *goal;
+ int err;
+
+ goal = damon_sysfs_intervals_goal_alloc(0, 0, 0, 0);
+ if (!goal)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&goal->kobj,
+ &damon_sysfs_intervals_goal_ktype, &intervals->kobj,
+ "intervals_goal");
+ if (err) {
+ kobject_put(&goal->kobj);
+ intervals->intervals_goal = NULL;
+ return err;
+ }
+ intervals->intervals_goal = goal;
+ return 0;
+}
+
+static void damon_sysfs_intervals_rm_dirs(struct damon_sysfs_intervals *intervals)
+{
+ kobject_put(&intervals->intervals_goal->kobj);
+}
+
static ssize_t sample_us_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -571,6 +756,9 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs)
"intervals");
if (err)
goto put_intervals_out;
+ err = damon_sysfs_intervals_add_dirs(intervals);
+ if (err)
+ goto put_intervals_out;
attrs->intervals = intervals;
nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000);
@@ -599,6 +787,7 @@ put_intervals_out:
static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs)
{
kobject_put(&attrs->nr_regions_range->kobj);
+ damon_sysfs_intervals_rm_dirs(attrs->intervals);
kobject_put(&attrs->intervals->kobj);
}
@@ -1025,6 +1214,11 @@ enum damon_sysfs_cmd {
*/
DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS,
/*
+ * @DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS: Update the tuned monitoring
+ * intevals.
+ */
+ DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS,
+ /*
* @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands.
*/
NR_DAMON_SYSFS_CMDS,
@@ -1041,27 +1235,9 @@ static const char * const damon_sysfs_cmd_strs[] = {
"update_schemes_tried_regions",
"clear_schemes_tried_regions",
"update_schemes_effective_quotas",
+ "update_tuned_intervals",
};
-/*
- * struct damon_sysfs_cmd_request - A request to the DAMON callback.
- * @cmd: The command that needs to be handled by the callback.
- * @kdamond: The kobject wrapper that associated to the kdamond thread.
- *
- * This structure represents a sysfs command request that need to access some
- * DAMON context-internal data. Because DAMON context-internal data can be
- * safely accessed from DAMON callbacks without additional synchronization, the
- * request will be handled by the DAMON callback. None-``NULL`` @kdamond means
- * the request is valid.
- */
-struct damon_sysfs_cmd_request {
- enum damon_sysfs_cmd cmd;
- struct damon_sysfs_kdamond *kdamond;
-};
-
-/* Current DAMON callback request. Protected by damon_sysfs_lock. */
-static struct damon_sysfs_cmd_request damon_sysfs_cmd_request;
-
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -1084,11 +1260,18 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
struct damon_sysfs_attrs *sys_attrs)
{
struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals;
+ struct damon_sysfs_intervals_goal *sys_goal =
+ sys_intervals->intervals_goal;
struct damon_sysfs_ul_range *sys_nr_regions =
sys_attrs->nr_regions_range;
struct damon_attrs attrs = {
.sample_interval = sys_intervals->sample_us,
.aggr_interval = sys_intervals->aggr_us,
+ .intervals_goal = {
+ .access_bp = sys_goal->access_bp,
+ .aggrs = sys_goal->aggrs,
+ .min_sample_us = sys_goal->min_sample_us,
+ .max_sample_us = sys_goal->max_sample_us},
.ops_update_interval = sys_intervals->update_us,
.min_nr_regions = sys_nr_regions->min,
.max_nr_regions = sys_nr_regions->max,
@@ -1247,11 +1430,12 @@ static struct damon_ctx *damon_sysfs_build_ctx(
* damon_sysfs_commit_input() - Commit user inputs to a running kdamond.
* @kdamond: The kobject wrapper for the associated kdamond.
*
- * If the sysfs input is wrong, the kdamond will be terminated.
+ * Returns error if the sysfs input is wrong.
*/
-static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
+static int damon_sysfs_commit_input(void *data)
{
- struct damon_ctx *param_ctx;
+ struct damon_sysfs_kdamond *kdamond = data;
+ struct damon_ctx *param_ctx, *test_ctx;
int err;
if (!damon_sysfs_kdamond_running(kdamond))
@@ -1263,7 +1447,15 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
param_ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]);
if (IS_ERR(param_ctx))
return PTR_ERR(param_ctx);
+ test_ctx = damon_new_ctx();
+ err = damon_commit_ctx(test_ctx, param_ctx);
+ if (err) {
+ damon_sysfs_destroy_targets(test_ctx);
+ damon_destroy_ctx(test_ctx);
+ goto out;
+ }
err = damon_commit_ctx(kdamond->damon_ctx, param_ctx);
+out:
damon_sysfs_destroy_targets(param_ctx);
damon_destroy_ctx(param_ctx);
return err;
@@ -1306,69 +1498,16 @@ static int damon_sysfs_upd_schemes_effective_quotas(void *data)
return 0;
}
-
-/*
- * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests.
- * @c: The DAMON context of the callback.
- * @active: Whether @c is not deactivated due to watermarks.
- * @after_aggr: Whether this is called from after_aggregation() callback.
- *
- * This function is periodically called back from the kdamond thread for @c.
- * Then, it checks if there is a waiting DAMON sysfs request and handles it.
- */
-static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active,
- bool after_aggregation)
-{
- struct damon_sysfs_kdamond *kdamond;
- int err = 0;
-
- /* avoid deadlock due to concurrent state_store('off') */
- if (!mutex_trylock(&damon_sysfs_lock))
- return 0;
- kdamond = damon_sysfs_cmd_request.kdamond;
- if (!kdamond || kdamond->damon_ctx != c)
- goto out;
- switch (damon_sysfs_cmd_request.cmd) {
- case DAMON_SYSFS_CMD_COMMIT:
- if (!after_aggregation)
- goto out;
- err = damon_sysfs_commit_input(kdamond);
- break;
- default:
- break;
- }
- /* Mark the request as invalid now. */
- damon_sysfs_cmd_request.kdamond = NULL;
-out:
- mutex_unlock(&damon_sysfs_lock);
- return err;
-}
-
-static int damon_sysfs_after_wmarks_check(struct damon_ctx *c)
+static int damon_sysfs_upd_tuned_intervals(void *data)
{
- /*
- * after_wmarks_check() is called back while the context is deactivated
- * by watermarks.
- */
- return damon_sysfs_cmd_request_callback(c, false, false);
-}
-
-static int damon_sysfs_after_sampling(struct damon_ctx *c)
-{
- /*
- * after_sampling() is called back only while the context is not
- * deactivated by watermarks.
- */
- return damon_sysfs_cmd_request_callback(c, true, false);
-}
+ struct damon_sysfs_kdamond *kdamond = data;
+ struct damon_ctx *ctx = kdamond->damon_ctx;
-static int damon_sysfs_after_aggregation(struct damon_ctx *c)
-{
- /*
- * after_aggregation() is called back only while the context is not
- * deactivated by watermarks.
- */
- return damon_sysfs_cmd_request_callback(c, true, true);
+ kdamond->contexts->contexts_arr[0]->attrs->intervals->sample_us =
+ ctx->attrs.sample_interval;
+ kdamond->contexts->contexts_arr[0]->attrs->intervals->aggr_us =
+ ctx->attrs.aggr_interval;
+ return 0;
}
static struct damon_ctx *damon_sysfs_build_ctx(
@@ -1386,9 +1525,6 @@ static struct damon_ctx *damon_sysfs_build_ctx(
return ERR_PTR(err);
}
- ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check;
- ctx->callback.after_sampling = damon_sysfs_after_sampling;
- ctx->callback.after_aggregation = damon_sysfs_after_aggregation;
ctx->callback.before_terminate = damon_sysfs_before_terminate;
return ctx;
}
@@ -1400,8 +1536,6 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
if (damon_sysfs_kdamond_running(kdamond))
return -EBUSY;
- if (damon_sysfs_cmd_request.kdamond == kdamond)
- return -EBUSY;
/* TODO: support multiple contexts per kdamond */
if (kdamond->contexts->nr != 1)
return -EINVAL;
@@ -1491,24 +1625,21 @@ static int damon_sysfs_update_schemes_tried_regions(
* @cmd: The command to handle.
* @kdamond: The kobject wrapper for the associated kdamond.
*
- * This function handles a DAMON sysfs command for a kdamond. For commands
- * that need to access running DAMON context-internal data, it requests
- * handling of the command to the DAMON callback
- * (@damon_sysfs_cmd_request_callback()) and wait until it is properly handled,
- * or the context is completed.
+ * This function handles a DAMON sysfs command for a kdamond.
*
* Return: 0 on success, negative error code otherwise.
*/
static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd,
struct damon_sysfs_kdamond *kdamond)
{
- bool need_wait = true;
-
switch (cmd) {
case DAMON_SYSFS_CMD_ON:
return damon_sysfs_turn_damon_on(kdamond);
case DAMON_SYSFS_CMD_OFF:
return damon_sysfs_turn_damon_off(kdamond);
+ case DAMON_SYSFS_CMD_COMMIT:
+ return damon_sysfs_damon_call(
+ damon_sysfs_commit_input, kdamond);
case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS:
return damon_sysfs_damon_call(
damon_sysfs_commit_schemes_quota_goals,
@@ -1527,39 +1658,12 @@ static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd,
return damon_sysfs_damon_call(
damon_sysfs_upd_schemes_effective_quotas,
kdamond);
+ case DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS:
+ return damon_sysfs_damon_call(
+ damon_sysfs_upd_tuned_intervals, kdamond);
default:
- break;
- }
-
- /* Pass the command to DAMON callback for safe DAMON context access */
- if (damon_sysfs_cmd_request.kdamond)
- return -EBUSY;
- if (!damon_sysfs_kdamond_running(kdamond))
return -EINVAL;
- damon_sysfs_cmd_request.cmd = cmd;
- damon_sysfs_cmd_request.kdamond = kdamond;
-
- /*
- * wait until damon_sysfs_cmd_request_callback() handles the request
- * from kdamond context
- */
- mutex_unlock(&damon_sysfs_lock);
- while (need_wait) {
- schedule_timeout_idle(msecs_to_jiffies(100));
- if (!mutex_trylock(&damon_sysfs_lock))
- continue;
- if (!damon_sysfs_cmd_request.kdamond) {
- /* damon_sysfs_cmd_request_callback() handled */
- need_wait = false;
- } else if (!damon_sysfs_kdamond_running(kdamond)) {
- /* kdamond has already finished */
- need_wait = false;
- damon_sysfs_cmd_request.kdamond = NULL;
- }
- mutex_unlock(&damon_sysfs_lock);
}
- mutex_lock(&damon_sysfs_lock);
- return 0;
}
static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1671,8 +1775,7 @@ static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds,
int i;
for (i = 0; i < nr_kdamonds; i++) {
- if (damon_sysfs_kdamond_running(kdamonds[i]) ||
- damon_sysfs_cmd_request.kdamond == kdamonds[i])
+ if (damon_sysfs_kdamond_running(kdamonds[i]))
return true;
}
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index 532c6a6f21f9..be0fea9ee5fc 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -348,19 +348,19 @@ static void damon_test_update_monitoring_result(struct kunit *test)
new_attrs = (struct damon_attrs){
.sample_interval = 100, .aggr_interval = 10000,};
- damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs, false);
KUNIT_EXPECT_EQ(test, r->nr_accesses, 15);
KUNIT_EXPECT_EQ(test, r->age, 2);
new_attrs = (struct damon_attrs){
.sample_interval = 1, .aggr_interval = 1000};
- damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs, false);
KUNIT_EXPECT_EQ(test, r->nr_accesses, 150);
KUNIT_EXPECT_EQ(test, r->age, 2);
new_attrs = (struct damon_attrs){
.sample_interval = 1, .aggr_interval = 100};
- damon_update_monitoring_result(r, &old_attrs, &new_attrs);
+ damon_update_monitoring_result(r, &old_attrs, &new_attrs, false);
KUNIT_EXPECT_EQ(test, r->nr_accesses, 150);
KUNIT_EXPECT_EQ(test, r->age, 20);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index a6174f725bd7..e6d99106a7f9 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -710,7 +710,6 @@ static int __init damon_va_initcall(void)
.update = damon_va_update,
.prepare_access_checks = damon_va_prepare_access_checks,
.check_accesses = damon_va_check_accesses,
- .reset_aggregated = NULL,
.target_valid = damon_va_target_valid,
.cleanup = NULL,
.apply_scheme = damon_va_apply_scheme,
diff --git a/mm/debug.c b/mm/debug.c
index 8d2acf432385..db83e381a8ae 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -79,12 +79,17 @@ static void __dump_folio(struct folio *folio, struct page *page,
folio_ref_count(folio), mapcount, mapping,
folio->index + idx, pfn);
if (folio_test_large(folio)) {
+ int pincount = 0;
+
+ if (folio_has_pincount(folio))
+ pincount = atomic_read(&folio->_pincount);
+
pr_warn("head: order:%u mapcount:%d entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
folio_order(folio),
folio_mapcount(folio),
folio_entire_mapcount(folio),
folio_nr_pages_mapped(folio),
- atomic_read(&folio->_pincount));
+ pincount);
}
#ifdef CONFIG_MEMCG
@@ -146,6 +151,9 @@ again:
if (idx < MAX_FOLIO_NR_PAGES) {
memcpy(&folio, foliop, 2 * sizeof(struct page));
nr_pages = folio_nr_pages(&folio);
+ if (nr_pages > 1)
+ memcpy(&folio.__page_2, &foliop->__page_2,
+ sizeof(struct page));
foliop = &folio;
}
@@ -165,7 +173,7 @@ dump:
void dump_page(const struct page *page, const char *reason)
{
if (PagePoisoned(page))
- pr_warn("page:%p is uninitialized and poisoned", page);
+ pr_warn("page:%p is uninitialized and poisoned\n", page);
else
__dump_page(page);
if (reason)
@@ -181,11 +189,17 @@ void dump_vma(const struct vm_area_struct *vma)
pr_emerg("vma %px start %px end %px mm %px\n"
"prot %lx anon_vma %px vm_ops %px\n"
"pgoff %lx file %px private_data %px\n"
+#ifdef CONFIG_PER_VMA_LOCK
+ "refcnt %x\n"
+#endif
"flags: %#lx(%pGv)\n",
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
vma->vm_file, vma->vm_private_data,
+#ifdef CONFIG_PER_VMA_LOCK
+ refcount_read(&vma->vm_refcnt),
+#endif
vma->vm_flags, &vma->vm_flags);
}
EXPORT_SYMBOL(dump_vma);
@@ -261,16 +275,19 @@ void dump_vmg(const struct vma_merge_struct *vmg, const char *reason)
pr_warn("vmg %px state: mm %px pgoff %lx\n"
"vmi %px [%lx,%lx)\n"
- "prev %px next %px vma %px\n"
+ "prev %px middle %px next %px target %px\n"
"start %lx end %lx flags %lx\n"
"file %px anon_vma %px policy %px\n"
"uffd_ctx %px\n"
"anon_name %px\n"
- "merge_flags %x state %x\n",
+ "state %x\n"
+ "just_expand %d\n"
+ "__adjust_middle_start %d __adjust_next_start %d\n"
+ "__remove_middle %d __remove_next %d\n",
vmg, vmg->mm, vmg->pgoff,
vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0,
vmg->vmi ? vma_iter_end(vmg->vmi) : 0,
- vmg->prev, vmg->next, vmg->vma,
+ vmg->prev, vmg->middle, vmg->next, vmg->target,
vmg->start, vmg->end, vmg->flags,
vmg->file, vmg->anon_vma, vmg->policy,
#ifdef CONFIG_USERFAULTFD
@@ -279,7 +296,10 @@ void dump_vmg(const struct vma_merge_struct *vmg, const char *reason)
(void *)0,
#endif
vmg->anon_name,
- (int)vmg->merge_flags, (int)vmg->state);
+ (int)vmg->state,
+ vmg->just_expand,
+ vmg->__adjust_middle_start, vmg->__adjust_next_start,
+ vmg->__remove_middle, vmg->__remove_next);
if (vmg->mm) {
pr_warn("vmg %px mm:\n", vmg);
@@ -288,13 +308,6 @@ void dump_vmg(const struct vma_merge_struct *vmg, const char *reason)
pr_warn("vmg %px mm: (NULL)\n", vmg);
}
- if (vmg->vma) {
- pr_warn("vmg %px vma:\n", vmg);
- dump_vma(vmg->vma);
- } else {
- pr_warn("vmg %px vma: (NULL)\n", vmg);
- }
-
if (vmg->prev) {
pr_warn("vmg %px prev:\n", vmg);
dump_vma(vmg->prev);
@@ -302,6 +315,13 @@ void dump_vmg(const struct vma_merge_struct *vmg, const char *reason)
pr_warn("vmg %px prev: (NULL)\n", vmg);
}
+ if (vmg->middle) {
+ pr_warn("vmg %px middle:\n", vmg);
+ dump_vma(vmg->middle);
+ } else {
+ pr_warn("vmg %px middle: (NULL)\n", vmg);
+ }
+
if (vmg->next) {
pr_warn("vmg %px next:\n", vmg);
dump_vma(vmg->next);
diff --git a/mm/execmem.c b/mm/execmem.c
index 317b6a8d35be..e6c4f5076ca8 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -257,7 +257,6 @@ out_unlock:
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
- unsigned long start, end;
struct vm_struct *vm;
size_t alloc_size;
int err = -ENOMEM;
@@ -275,26 +274,18 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
/* fill memory with instructions that will trap */
execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
- start = (unsigned long)p;
- end = start + alloc_size;
-
- vunmap_range(start, end);
-
- err = execmem_set_direct_map_valid(vm, false);
- if (err)
- goto err_free_mem;
-
- err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
- PMD_SHIFT);
+ err = set_memory_rox((unsigned long)p, vm->nr_pages);
if (err)
goto err_free_mem;
err = execmem_cache_add(p, alloc_size);
if (err)
- goto err_free_mem;
+ goto err_reset_direct_map;
return 0;
+err_reset_direct_map:
+ execmem_set_direct_map_valid(vm, true);
err_free_mem:
vfree(p);
return err;
@@ -344,6 +335,28 @@ static bool execmem_cache_free(void *ptr)
return true;
}
+
+int execmem_make_temp_rw(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+ int ret;
+
+ ret = set_memory_nx(addr, nr);
+ if (ret)
+ return ret;
+
+ return set_memory_rw(addr, nr);
+}
+
+int execmem_restore_rox(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+
+ return set_memory_rox(addr, nr);
+}
+
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
diff --git a/mm/filemap.c b/mm/filemap.c
index 804d7365680c..b5e784f34d98 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -47,7 +47,7 @@
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
+#include <linux/sysctl.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -227,15 +227,12 @@ void __filemap_remove_folio(struct folio *folio, void *shadow)
void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
void (*free_folio)(struct folio *);
- int refs = 1;
free_folio = mapping->a_ops->free_folio;
if (free_folio)
free_folio(folio);
- if (folio_test_large(folio))
- refs = folio_nr_pages(folio);
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, folio_nr_pages(folio));
}
/**
@@ -445,7 +442,7 @@ EXPORT_SYMBOL(filemap_fdatawrite_range);
* filemap_fdatawrite_range_kick - start writeback on a range
* @mapping: target address_space
* @start: index to start writeback on
- * @end: last (non-inclusive) index for writeback
+ * @end: last (inclusive) index for writeback
*
* This is a non-integrity writeback helper, to start writing back folios
* for the indicated range.
@@ -860,11 +857,10 @@ EXPORT_SYMBOL_GPL(replace_page_cache_folio);
noinline int __filemap_add_folio(struct address_space *mapping,
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
- XA_STATE(xas, &mapping->i_pages, index);
- void *alloced_shadow = NULL;
- int alloced_order = 0;
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
bool huge;
long nr;
+ unsigned int forder = folio_order(folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
@@ -873,7 +869,6 @@ noinline int __filemap_add_folio(struct address_space *mapping,
mapping_set_update(&xas, mapping);
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
- xas_set_order(&xas, index, folio_order(folio));
huge = folio_test_hugetlb(folio);
nr = folio_nr_pages(folio);
@@ -883,7 +878,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
folio->index = xas.xa_index;
for (;;) {
- int order = -1, split_order = 0;
+ int order = -1;
void *entry, *old = NULL;
xas_lock_irq(&xas);
@@ -901,21 +896,25 @@ noinline int __filemap_add_folio(struct address_space *mapping,
order = xas_get_order(&xas);
}
- /* entry may have changed before we re-acquire the lock */
- if (alloced_order && (old != alloced_shadow || order != alloced_order)) {
- xas_destroy(&xas);
- alloced_order = 0;
- }
-
if (old) {
- if (order > 0 && order > folio_order(folio)) {
+ if (order > 0 && order > forder) {
+ unsigned int split_order = max(forder,
+ xas_try_split_min_order(order));
+
/* How to handle large swap entries? */
BUG_ON(shmem_mapping(mapping));
- if (!alloced_order) {
- split_order = order;
- goto unlock;
+
+ while (order > forder) {
+ xas_set_order(&xas, index, split_order);
+ xas_try_split(&xas, old, order);
+ if (xas_error(&xas))
+ goto unlock;
+ order = split_order;
+ split_order =
+ max(xas_try_split_min_order(
+ split_order),
+ forder);
}
- xas_split(&xas, old, order);
xas_reset(&xas);
}
if (shadowp)
@@ -939,17 +938,6 @@ noinline int __filemap_add_folio(struct address_space *mapping,
unlock:
xas_unlock_irq(&xas);
- /* split needed, alloc here and retry. */
- if (split_order) {
- xas_split_alloc(&xas, old, split_order, gfp);
- if (xas_error(&xas))
- goto error;
- alloced_shadow = old;
- alloced_order = split_order;
- xas_reset(&xas);
- continue;
- }
-
if (!xas_nomem(&xas, gfp))
break;
}
@@ -1078,6 +1066,19 @@ static wait_queue_head_t *folio_waitqueue(struct folio *folio)
return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}
+/* How many times do we accept lock stealing from under a waiter? */
+static int sysctl_page_lock_unfairness = 5;
+static const struct ctl_table filemap_sysctl_table[] = {
+ {
+ .procname = "page_lock_unfairness",
+ .data = &sysctl_page_lock_unfairness,
+ .maxlen = sizeof(sysctl_page_lock_unfairness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ }
+};
+
void __init pagecache_init(void)
{
int i;
@@ -1086,6 +1087,7 @@ void __init pagecache_init(void)
init_waitqueue_head(&folio_wait_table[i]);
page_writeback_init();
+ register_sysctl_init("vm", filemap_sysctl_table);
}
/*
@@ -1233,9 +1235,6 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
return true;
}
-/* How many times do we accept lock stealing from under a waiter? */
-int sysctl_page_lock_unfairness = 5;
-
static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
int state, enum behavior behavior)
{
@@ -1379,7 +1378,7 @@ repeat:
* @ptl: already locked ptl. This function will drop the lock.
*
* Wait for a migration entry referencing the given page to be removed. This is
- * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
+ * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
* this can be called without taking a reference on the page. Instead this
* should be called while holding the ptl for the migration entry referencing
* the page.
@@ -1986,8 +1985,19 @@ no_page:
if (err == -EEXIST)
goto repeat;
- if (err)
+ if (err) {
+ /*
+ * When NOWAIT I/O fails to allocate folios this could
+ * be due to a nonblocking memory allocation and not
+ * because the system actually is out of memory.
+ * Return -EAGAIN so that there caller retries in a
+ * blocking fashion instead of propagating -ENOMEM
+ * to the application.
+ */
+ if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
+ err = -EAGAIN;
return ERR_PTR(err);
+ }
/*
* filemap_add_folio locks the page, and for mmap
* we expect an unlocked page.
@@ -2897,8 +2907,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
size = min(size, folio_size(folio) - offset);
offset %= PAGE_SIZE;
- while (spliced < size &&
- !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ while (spliced < size && !pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
@@ -2955,7 +2964,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
iocb.ki_pos = *ppos;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -3015,7 +3024,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
goto out;
}
@@ -3199,14 +3208,6 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
unsigned long vm_flags = vmf->vma->vm_flags;
unsigned int mmap_miss;
- /*
- * If we have pre-content watches we need to disable readahead to make
- * sure that we don't populate our mapping with 0 filled pages that we
- * never emitted an event for.
- */
- if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
- return fpin;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
@@ -3275,10 +3276,6 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file *fpin = NULL;
unsigned int mmap_miss;
- /* See comment in do_sync_mmap_readahead. */
- if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
- return fpin;
-
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
@@ -3338,48 +3335,6 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
}
/**
- * filemap_fsnotify_fault - maybe emit a pre-content event.
- * @vmf: struct vm_fault containing details of the fault.
- *
- * If we have a pre-content watch on this file we will emit an event for this
- * range. If we return anything the fault caller should return immediately, we
- * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the
- * fault again and then the fault handler will run the second time through.
- *
- * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
- */
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
- struct file *fpin = NULL;
- int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
- loff_t pos = vmf->pgoff >> PAGE_SHIFT;
- size_t count = PAGE_SIZE;
- int err;
-
- /*
- * We already did this and now we're retrying with everything locked,
- * don't emit the event and continue.
- */
- if (vmf->flags & FAULT_FLAG_TRIED)
- return 0;
-
- /* No watches, we're done. */
- if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode)))
- return 0;
-
- fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- if (!fpin)
- return VM_FAULT_SIGBUS;
-
- err = fsnotify_file_area_perm(fpin, mask, &pos, count);
- fput(fpin);
- if (err)
- return VM_FAULT_SIGBUS;
- return VM_FAULT_RETRY;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
-/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
*
@@ -3483,37 +3438,6 @@ retry_find:
*/
if (unlikely(!folio_test_uptodate(folio))) {
/*
- * If this is a precontent file we have can now emit an event to
- * try and populate the folio.
- */
- if (!(vmf->flags & FAULT_FLAG_TRIED) &&
- unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
- loff_t pos = folio_pos(folio);
- size_t count = folio_size(folio);
-
- /* We're NOWAIT, we have to retry. */
- if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
- folio_unlock(folio);
- goto out_retry;
- }
-
- if (mapping_locked)
- filemap_invalidate_unlock_shared(mapping);
- mapping_locked = false;
-
- folio_unlock(folio);
- fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- if (!fpin)
- goto out_retry;
-
- error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos,
- count);
- if (error)
- ret = VM_FAULT_SIGBUS;
- goto out_retry;
- }
-
- /*
* If the invalidate lock is not held, the folio was in cache
* and uptodate and now it is not. Strange but possible since we
* didn't hold the page lock all the time. Let's drop
@@ -4170,17 +4094,6 @@ retry:
bytes = min(chunk - offset, bytes);
balance_dirty_pages_ratelimited(mapping);
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- */
- if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
- status = -EFAULT;
- break;
- }
-
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
@@ -4198,6 +4111,12 @@ retry:
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
+ /*
+ * Faults here on mmap()s can recurse into arbitrary
+ * filesystem code. Lots of locks are held that can
+ * deadlock. Use an atomic copy to avoid deadlocking
+ * in page fault handling.
+ */
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
flush_dcache_folio(folio);
@@ -4223,6 +4142,16 @@ retry:
bytes = copied;
goto retry;
}
+
+ /*
+ * 'folio' is now unlocked and faults on it can be
+ * handled. Ensure forward progress by trying to
+ * fault it in now.
+ */
+ if (fault_in_iov_iter_readable(i, bytes) == bytes) {
+ status = -EFAULT;
+ break;
+ }
} else {
pos += status;
written += status;
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 1d1832e2a599..45540942d148 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -28,12 +28,6 @@ void wait_on_page_writeback(struct page *page)
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback);
-void wait_for_stable_page(struct page *page)
-{
- return folio_wait_stable(page_folio(page));
-}
-EXPORT_SYMBOL_GPL(wait_for_stable_page);
-
void mark_page_accessed(struct page *page)
{
folio_mark_accessed(page_folio(page));
@@ -90,11 +84,3 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
return folio_file_page(folio, index);
}
EXPORT_SYMBOL(pagecache_get_page);
-
-struct page *grab_cache_page_write_begin(struct address_space *mapping,
- pgoff_t index)
-{
- return pagecache_get_page(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
-}
-EXPORT_SYMBOL(grab_cache_page_write_begin);
diff --git a/mm/gup.c b/mm/gup.c
index 3883b307780e..92351e2fa876 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -96,8 +96,7 @@ retry:
* belongs to this folio.
*/
if (unlikely(page_folio(page) != folio)) {
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
goto retry;
}
@@ -110,14 +109,13 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
if (is_zero_folio(folio))
return;
node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
- if (folio_test_large(folio))
+ if (folio_has_pincount(folio))
atomic_sub(refs, &folio->_pincount);
else
refs *= GUP_PIN_COUNTING_BIAS;
}
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
}
/**
@@ -166,7 +164,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
* Increment the normal page refcount field at least once,
* so that the page really is pinned.
*/
- if (folio_test_large(folio)) {
+ if (folio_has_pincount(folio)) {
folio_ref_add(folio, refs);
atomic_add(refs, &folio->_pincount);
} else {
@@ -225,7 +223,7 @@ void folio_add_pin(struct folio *folio)
* page refcount field at least once, so that the page really is
* pinned.
*/
- if (folio_test_large(folio)) {
+ if (folio_has_pincount(folio)) {
WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
folio_ref_inc(folio);
atomic_inc(&folio->_pincount);
@@ -565,8 +563,7 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs,
*/
if (unlikely((flags & FOLL_LONGTERM) &&
!folio_is_longterm_pinnable(folio))) {
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
return NULL;
}
@@ -578,7 +575,7 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs,
* is pinned. That's why the refcount from the earlier
* try_get_folio() is left intact.
*/
- if (folio_test_large(folio))
+ if (folio_has_pincount(folio))
atomic_add(refs, &folio->_pincount);
else
folio_ref_add(folio,
@@ -1283,6 +1280,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
return -EOPNOTSUPP;
+ if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
+ return -EOPNOTSUPP;
+
if (vma_is_secretmem(vma))
return -EFAULT;
@@ -2254,6 +2254,7 @@ EXPORT_SYMBOL(fault_in_readable);
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
+ * @locked: a pointer to an int denoting whether the mmap sem is held
*
* Returns struct page pointer of user page pinned for dump,
* to be freed afterwards by put_page().
@@ -2266,13 +2267,12 @@ EXPORT_SYMBOL(fault_in_readable);
* Called without mmap_lock (takes and releases the mmap_lock by itself).
*/
#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
+struct page *get_dump_page(unsigned long addr, int *locked)
{
struct page *page;
- int locked = 0;
int ret;
- ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
+ ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
return (ret == 1) ? page : NULL;
}
@@ -2757,7 +2757,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
*
* *) ptes can be read atomically by the architecture.
*
- * *) access_ok is sufficient to validate userspace address ranges.
+ * *) valid user addesses are below TASK_MAX_SIZE
*
* The last two assumptions can be relaxed by the addition of helper functions.
*
@@ -3010,11 +3010,6 @@ static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
break;
}
- if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
- gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
- break;
- }
-
folio = try_grab_folio_fast(page, 1, flags);
if (!folio) {
gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
@@ -3411,8 +3406,6 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
return -EOVERFLOW;
if (end > TASK_SIZE_MAX)
return -EFAULT;
- if (unlikely(!access_ok((void __user *)start, len)))
- return -EFAULT;
nr_pinned = gup_fast(start, end, gup_flags, pages);
if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
diff --git a/mm/hmm.c b/mm/hmm.c
index 7e0229ae4a5a..082f7b7c0b9e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -248,7 +248,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* just report the PFN.
*/
if (is_device_private_entry(entry) &&
- pfn_swap_entry_to_page(entry)->pgmap->owner ==
+ page_pgmap(pfn_swap_entry_to_page(entry))->owner ==
range->dev_private_owner) {
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3d3ebdc002d5..2a47682d1ab7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1309,8 +1309,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
struct folio *zero_folio)
{
pmd_t entry;
- if (!pmd_none(*pmd))
- return;
entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
@@ -1375,20 +1373,20 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return __do_huge_pmd_anonymous_page(vmf);
}
-static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
pgtable_t pgtable)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t entry;
- spinlock_t *ptl;
- ptl = pmd_lock(mm, pmd);
+ lockdep_assert_held(pmd_lockptr(mm, pmd));
+
if (!pmd_none(*pmd)) {
if (write) {
if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
- goto out_unlock;
+ return -EEXIST;
}
entry = pmd_mkyoung(*pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1396,7 +1394,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
update_mmu_cache_pmd(vma, addr, pmd);
}
- goto out_unlock;
+ return -EEXIST;
}
entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
@@ -1412,16 +1410,11 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
mm_inc_nr_ptes(mm);
- pgtable = NULL;
}
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
-
-out_unlock:
- spin_unlock(ptl);
- if (pgtable)
- pte_free(mm, pgtable);
+ return 0;
}
/**
@@ -1440,6 +1433,8 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
pgtable_t pgtable = NULL;
+ spinlock_t *ptl;
+ int error;
/*
* If we had pmd_special, we could avoid all these restrictions,
@@ -1462,12 +1457,56 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
}
track_pfn_insert(vma, &pgprot, pfn);
+ ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write,
+ pgtable);
+ spin_unlock(ptl);
+ if (error && pgtable)
+ pte_free(vma->vm_mm, pgtable);
- insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
+vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
+ bool write)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long addr = vmf->address & PMD_MASK;
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pgtable_t pgtable = NULL;
+ int error;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
+ return VM_FAULT_SIGBUS;
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+
+ ptl = pmd_lock(mm, vmf->pmd);
+ if (pmd_none(*vmf->pmd)) {
+ folio_get(folio);
+ folio_add_file_rmap_pmd(folio, &folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
+ }
+ error = insert_pfn_pmd(vma, addr, vmf->pmd,
+ pfn_to_pfn_t(folio_pfn(folio)), vma->vm_page_prot,
+ write, pgtable);
+ spin_unlock(ptl);
+ if (error && pgtable)
+ pte_free(mm, pgtable);
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
+
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
@@ -1482,19 +1521,17 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
struct mm_struct *mm = vma->vm_mm;
pgprot_t prot = vma->vm_page_prot;
pud_t entry;
- spinlock_t *ptl;
- ptl = pud_lock(mm, pud);
if (!pud_none(*pud)) {
if (write) {
if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
- goto out_unlock;
+ return;
entry = pud_mkyoung(*pud);
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
if (pudp_set_access_flags(vma, addr, pud, entry, 1))
update_mmu_cache_pud(vma, addr, pud);
}
- goto out_unlock;
+ return;
}
entry = pud_mkhuge(pfn_t_pud(pfn, prot));
@@ -1508,9 +1545,6 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
}
set_pud_at(mm, addr, pud, entry);
update_mmu_cache_pud(vma, addr, pud);
-
-out_unlock:
- spin_unlock(ptl);
}
/**
@@ -1528,6 +1562,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
unsigned long addr = vmf->address & PUD_MASK;
struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
+ spinlock_t *ptl;
/*
* If we had pud_special, we could avoid all these restrictions,
@@ -1545,10 +1580,57 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
track_pfn_insert(vma, &pgprot, pfn);
+ ptl = pud_lock(vma->vm_mm, vmf->pud);
insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
+ spin_unlock(ptl);
+
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+
+/**
+ * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
+ * @vmf: Structure describing the fault
+ * @folio: folio to insert
+ * @write: whether it's a write fault
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
+ bool write)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long addr = vmf->address & PUD_MASK;
+ pud_t *pud = vmf->pud;
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
+ return VM_FAULT_SIGBUS;
+
+ ptl = pud_lock(mm, pud);
+
+ /*
+ * If there is already an entry present we assume the folio is
+ * already mapped, hence no need to take another reference. We
+ * still call insert_pfn_pud() though in case the mapping needs
+ * upgrading to writeable.
+ */
+ if (pud_none(*vmf->pud)) {
+ folio_get(folio);
+ folio_add_file_rmap_pud(folio, &folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
+ }
+ insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)),
+ write);
+ spin_unlock(ptl);
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1698,7 +1780,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
src_folio = page_folio(src_page);
folio_get(src_folio);
- if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
+ if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) {
/* Page maybe pinned: split and retry the fault on PTEs. */
folio_put(src_folio);
pte_free(dst_mm, pgtable);
@@ -2071,7 +2153,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* If other processes are mapping this folio, we couldn't discard
* the folio unless they all do MADV_FREE so let's skip the folio.
*/
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
goto out;
if (!folio_trylock(folio))
@@ -2141,12 +2223,13 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb->fullmm);
arch_check_zapped_pmd(vma, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- if (vma_is_special_huge(vma)) {
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else if (is_huge_zero_pmd(orig_pmd)) {
- zap_deposited_table(tlb->mm, pmd);
+ if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
+ zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
} else {
struct folio *folio = NULL;
@@ -2646,12 +2729,24 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
arch_check_zapped_pud(vma, orig_pud);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
- if (vma_is_special_huge(vma)) {
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
spin_unlock(ptl);
/* No zero page support yet */
} else {
- /* No support for anonymous PUD pages yet */
- BUG();
+ struct page *page = NULL;
+ struct folio *folio;
+
+ /* No support for anonymous PUD pages or migration yet */
+ VM_WARN_ON_ONCE(vma_is_anonymous(vma) ||
+ !pud_present(orig_pud));
+
+ page = pud_page(orig_pud);
+ folio = page_folio(page);
+ folio_remove_rmap_pud(folio, page, vma);
+ add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
+
+ spin_unlock(ptl);
+ tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
}
return 1;
}
@@ -2659,6 +2754,10 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
unsigned long haddr)
{
+ struct folio *folio;
+ struct page *page;
+ pud_t old_pud;
+
VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
@@ -2666,7 +2765,22 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
count_vm_event(THP_SPLIT_PUD);
- pudp_huge_clear_flush(vma, haddr, pud);
+ old_pud = pudp_huge_clear_flush(vma, haddr, pud);
+
+ if (!vma_is_dax(vma))
+ return;
+
+ page = pud_page(old_pud);
+ folio = page_folio(page);
+
+ if (!folio_test_dirty(folio) && pud_dirty(old_pud))
+ folio_mark_dirty(folio);
+ if (!folio_test_referenced(folio) && pud_young(old_pud))
+ folio_set_referenced(folio);
+ folio_remove_rmap_pud(folio, page, vma);
+ folio_put(folio);
+ add_mm_counter(vma->vm_mm, mm_counter_file(folio),
+ -HPAGE_PUD_NR);
}
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
@@ -2766,13 +2880,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd);
- if (vma_is_special_huge(vma))
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma))
return;
if (unlikely(is_pmd_migration_entry(old_pmd))) {
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
folio = pfn_swap_entry_folio(entry);
+ } else if (is_huge_zero_pmd(old_pmd)) {
+ return;
} else {
page = pmd_page(old_pmd);
folio = page_folio(page);
@@ -3017,9 +3133,9 @@ static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- long adjust_next)
+ unsigned long start,
+ unsigned long end,
+ struct vm_area_struct *next)
{
/* Check if we need to split start first. */
split_huge_pmd_if_needed(vma, start);
@@ -3027,16 +3143,9 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
/* Check if we need to split end next. */
split_huge_pmd_if_needed(vma, end);
- /*
- * If we're also updating the next vma vm_start,
- * check if we need to split it.
- */
- if (adjust_next > 0) {
- struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
- unsigned long nstart = next->vm_start;
- nstart += adjust_next;
- split_huge_pmd_if_needed(next, nstart);
- }
+ /* If we're incrementing next->vm_start, we might need to split it. */
+ if (next)
+ split_huge_pmd_if_needed(next, end);
}
static void unmap_folio(struct folio *folio)
@@ -3070,8 +3179,12 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
int ref_count, map_count;
pmd_t orig_pmd = *pmdp;
- if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
+ if (pmd_dirty(orig_pmd))
+ folio_set_dirty(folio);
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ folio_set_swapbacked(folio);
return false;
+ }
orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
@@ -3098,8 +3211,15 @@ static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
*
* The only folio refs must be one from isolation plus the rmap(s).
*/
- if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
- ref_count != map_count + 1) {
+ if (pmd_dirty(orig_pmd))
+ folio_set_dirty(folio);
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ folio_set_swapbacked(folio);
+ set_pmd_at(mm, addr, pmdp, orig_pmd);
+ return false;
+ }
+
+ if (ref_count != map_count + 1) {
set_pmd_at(mm, addr, pmdp, orig_pmd);
return false;
}
@@ -3119,12 +3239,11 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
{
VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+ VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
- if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
- return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
-
- return false;
+ return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
}
static void remap_page(struct folio *folio, unsigned long nr, int flags)
@@ -3143,225 +3262,378 @@ static void remap_page(struct folio *folio, unsigned long nr, int flags)
}
}
-static void lru_add_page_tail(struct folio *folio, struct page *tail,
+static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
struct lruvec *lruvec, struct list_head *list)
{
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- VM_BUG_ON_FOLIO(PageLRU(tail), folio);
+ VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
lockdep_assert_held(&lruvec->lru_lock);
if (list) {
/* page reclaim is reclaiming a huge page */
VM_WARN_ON(folio_test_lru(folio));
- get_page(tail);
- list_add_tail(&tail->lru, list);
+ folio_get(new_folio);
+ list_add_tail(&new_folio->lru, list);
} else {
/* head is still on lru (and we have it frozen) */
VM_WARN_ON(!folio_test_lru(folio));
if (folio_test_unevictable(folio))
- tail->mlock_count = 0;
+ new_folio->mlock_count = 0;
else
- list_add_tail(&tail->lru, &folio->lru);
- SetPageLRU(tail);
+ list_add_tail(&new_folio->lru, &folio->lru);
+ folio_set_lru(new_folio);
}
}
-static void __split_huge_page_tail(struct folio *folio, int tail,
- struct lruvec *lruvec, struct list_head *list,
- unsigned int new_order)
+/* Racy check whether the huge page can be split */
+bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
{
- struct page *head = &folio->page;
- struct page *page_tail = head + tail;
- /*
- * Careful: new_folio is not a "real" folio before we cleared PageTail.
- * Don't pass it around before clear_compound_head().
- */
- struct folio *new_folio = (struct folio *)page_tail;
+ int extra_pins;
+
+ /* Additional pins from page cache */
+ if (folio_test_anon(folio))
+ extra_pins = folio_test_swapcache(folio) ?
+ folio_nr_pages(folio) : 0;
+ else
+ extra_pins = folio_nr_pages(folio);
+ if (pextra_pins)
+ *pextra_pins = extra_pins;
+ return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
+ caller_pins;
+}
- VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
+/*
+ * It splits @folio into @new_order folios and copies the @folio metadata to
+ * all the resulting folios.
+ */
+static void __split_folio_to_order(struct folio *folio, int old_order,
+ int new_order)
+{
+ long new_nr_pages = 1 << new_order;
+ long nr_pages = 1 << old_order;
+ long i;
/*
- * Clone page flags before unfreezing refcount.
- *
- * After successful get_page_unless_zero() might follow flags change,
- * for example lock_page() which set PG_waiters.
- *
- * Note that for mapped sub-pages of an anonymous THP,
- * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
- * the migration entry instead from where remap_page() will restore it.
- * We can still have PG_anon_exclusive set on effectively unmapped and
- * unreferenced sub-pages of an anonymous THP: we can simply drop
- * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
+ * Skip the first new_nr_pages, since the new folio from them have all
+ * the flags from the original folio.
*/
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- page_tail->flags |= (head->flags &
- ((1L << PG_referenced) |
- (1L << PG_swapbacked) |
- (1L << PG_swapcache) |
- (1L << PG_mlocked) |
- (1L << PG_uptodate) |
- (1L << PG_active) |
- (1L << PG_workingset) |
- (1L << PG_locked) |
- (1L << PG_unevictable) |
+ for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
+ struct page *new_head = &folio->page + i;
+
+ /*
+ * Careful: new_folio is not a "real" folio before we cleared PageTail.
+ * Don't pass it around before clear_compound_head().
+ */
+ struct folio *new_folio = (struct folio *)new_head;
+
+ VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head);
+
+ /*
+ * Clone page flags before unfreezing refcount.
+ *
+ * After successful get_page_unless_zero() might follow flags change,
+ * for example lock_page() which set PG_waiters.
+ *
+ * Note that for mapped sub-pages of an anonymous THP,
+ * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
+ * the migration entry instead from where remap_page() will restore it.
+ * We can still have PG_anon_exclusive set on effectively unmapped and
+ * unreferenced sub-pages of an anonymous THP: we can simply drop
+ * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
+ */
+ new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ new_folio->flags |= (folio->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_swapcache) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate) |
+ (1L << PG_active) |
+ (1L << PG_workingset) |
+ (1L << PG_locked) |
+ (1L << PG_unevictable) |
#ifdef CONFIG_ARCH_USES_PG_ARCH_2
- (1L << PG_arch_2) |
+ (1L << PG_arch_2) |
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_3
- (1L << PG_arch_3) |
+ (1L << PG_arch_3) |
#endif
- (1L << PG_dirty) |
- LRU_GEN_MASK | LRU_REFS_MASK));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));
- /* ->mapping in first and second tail page is replaced by other uses */
- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
- page_tail);
- new_folio->mapping = folio->mapping;
- new_folio->index = folio->index + tail;
+ new_folio->mapping = folio->mapping;
+ new_folio->index = folio->index + i;
- /*
- * page->private should not be set in tail pages. Fix up and warn once
- * if private is unexpectedly set.
- */
- if (unlikely(page_tail->private)) {
- VM_WARN_ON_ONCE_PAGE(true, page_tail);
- page_tail->private = 0;
- }
- if (folio_test_swapcache(folio))
- new_folio->swap.val = folio->swap.val + tail;
+ /*
+ * page->private should not be set in tail pages. Fix up and warn once
+ * if private is unexpectedly set.
+ */
+ if (unlikely(new_folio->private)) {
+ VM_WARN_ON_ONCE_PAGE(true, new_head);
+ new_folio->private = NULL;
+ }
- /* Page flags must be visible before we make the page non-compound. */
- smp_wmb();
+ if (folio_test_swapcache(folio))
+ new_folio->swap.val = folio->swap.val + i;
- /*
- * Clear PageTail before unfreezing page refcount.
- *
- * After successful get_page_unless_zero() might follow put_page()
- * which needs correct compound_head().
- */
- clear_compound_head(page_tail);
- if (new_order) {
- prep_compound_page(page_tail, new_order);
- folio_set_large_rmappable(new_folio);
- }
+ /* Page flags must be visible before we make the page non-compound. */
+ smp_wmb();
- /* Finally unfreeze refcount. Additional reference from page cache. */
- page_ref_unfreeze(page_tail,
- 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
- folio_nr_pages(new_folio) : 0));
+ /*
+ * Clear PageTail before unfreezing page refcount.
+ *
+ * After successful get_page_unless_zero() might follow put_page()
+ * which needs correct compound_head().
+ */
+ clear_compound_head(new_head);
+ if (new_order) {
+ prep_compound_page(new_head, new_order);
+ folio_set_large_rmappable(new_folio);
+ }
- if (folio_test_young(folio))
- folio_set_young(new_folio);
- if (folio_test_idle(folio))
- folio_set_idle(new_folio);
+ if (folio_test_young(folio))
+ folio_set_young(new_folio);
+ if (folio_test_idle(folio))
+ folio_set_idle(new_folio);
+#ifdef CONFIG_MEMCG
+ new_folio->memcg_data = folio->memcg_data;
+#endif
- folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
+ folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
+ }
- /*
- * always add to the tail because some iterators expect new
- * pages to show after the currently processed elements - e.g.
- * migrate_pages
- */
- lru_add_page_tail(folio, page_tail, lruvec, list);
+ if (new_order)
+ folio_set_order(folio, new_order);
+ else
+ ClearPageCompound(&folio->page);
}
-static void __split_huge_page(struct page *page, struct list_head *list,
- pgoff_t end, unsigned int new_order)
+/*
+ * It splits an unmapped @folio to lower order smaller folios in two ways.
+ * @folio: the to-be-split folio
+ * @new_order: the smallest order of the after split folios (since buddy
+ * allocator like split generates folios with orders from @folio's
+ * order - 1 to new_order).
+ * @split_at: in buddy allocator like split, the folio containing @split_at
+ * will be split until its order becomes @new_order.
+ * @lock_at: the folio containing @lock_at is left locked for caller.
+ * @list: the after split folios will be added to @list if it is not NULL,
+ * otherwise to LRU lists.
+ * @end: the end of the file @folio maps to. -1 if @folio is anonymous memory.
+ * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
+ * @mapping: @folio->mapping
+ * @uniform_split: if the split is uniform or not (buddy allocator like split)
+ *
+ *
+ * 1. uniform split: the given @folio into multiple @new_order small folios,
+ * where all small folios have the same order. This is done when
+ * uniform_split is true.
+ * 2. buddy allocator like (non-uniform) split: the given @folio is split into
+ * half and one of the half (containing the given page) is split into half
+ * until the given @page's order becomes @new_order. This is done when
+ * uniform_split is false.
+ *
+ * The high level flow for these two methods are:
+ * 1. uniform split: a single __split_folio_to_order() is called to split the
+ * @folio into @new_order, then we traverse all the resulting folios one by
+ * one in PFN ascending order and perform stats, unfreeze, adding to list,
+ * and file mapping index operations.
+ * 2. non-uniform split: in general, folio_order - @new_order calls to
+ * __split_folio_to_order() are made in a for loop to split the @folio
+ * to one lower order at a time. The resulting small folios are processed
+ * like what is done during the traversal in 1, except the one containing
+ * @page, which is split in next for loop.
+ *
+ * After splitting, the caller's folio reference will be transferred to the
+ * folio containing @page. The other folios may be freed if they are not mapped.
+ *
+ * In terms of locking, after splitting,
+ * 1. uniform split leaves @page (or the folio contains it) locked;
+ * 2. buddy allocator like (non-uniform) split leaves @folio locked.
+ *
+ *
+ * For !uniform_split, when -ENOMEM is returned, the original folio might be
+ * split. The caller needs to check the input folio.
+ */
+static int __split_unmapped_folio(struct folio *folio, int new_order,
+ struct page *split_at, struct page *lock_at,
+ struct list_head *list, pgoff_t end,
+ struct xa_state *xas, struct address_space *mapping,
+ bool uniform_split)
{
- struct folio *folio = page_folio(page);
- struct page *head = &folio->page;
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
- unsigned long offset = 0;
- int i, nr_dropped = 0;
- unsigned int new_nr = 1 << new_order;
+ struct folio *origin_folio = folio;
+ struct folio *next_folio = folio_next(folio);
+ struct folio *new_folio;
+ struct folio *next;
int order = folio_order(folio);
- unsigned int nr = 1 << order;
+ int split_order;
+ int start_order = uniform_split ? new_order : order - 1;
+ int nr_dropped = 0;
+ int ret = 0;
+ bool stop_split = false;
+
+ if (folio_test_swapcache(folio)) {
+ VM_BUG_ON(mapping);
- /* complete memcg works before add pages to LRU */
- split_page_memcg(head, order, new_order);
+ /* a swapcache folio can only be uniformly split to order-0 */
+ if (!uniform_split || new_order != 0)
+ return -EINVAL;
- if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
- offset = swap_cache_index(folio->swap);
swap_cache = swap_address_space(folio->swap);
xa_lock(&swap_cache->i_pages);
}
+ if (folio_test_anon(folio))
+ mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
+
/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
lruvec = folio_lruvec_lock(folio);
folio_clear_has_hwpoisoned(folio);
- for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
- struct folio *tail;
- __split_huge_page_tail(folio, i, lruvec, list, new_order);
- tail = page_folio(head + i);
- /* Some pages can be beyond EOF: drop them from page cache */
- if (tail->index >= end) {
- if (shmem_mapping(folio->mapping))
- nr_dropped++;
- else if (folio_test_clear_dirty(tail))
- folio_account_cleaned(tail,
- inode_to_wb(folio->mapping->host));
- __filemap_remove_folio(tail, NULL);
- folio_put(tail);
- } else if (!folio_test_anon(folio)) {
- __xa_store(&folio->mapping->i_pages, tail->index,
- tail, 0);
- } else if (swap_cache) {
- __xa_store(&swap_cache->i_pages, offset + i,
- tail, 0);
+ /*
+ * split to new_order one order at a time. For uniform split,
+ * folio is split to new_order directly.
+ */
+ for (split_order = start_order;
+ split_order >= new_order && !stop_split;
+ split_order--) {
+ int old_order = folio_order(folio);
+ struct folio *release;
+ struct folio *end_folio = folio_next(folio);
+
+ /* order-1 anonymous folio is not supported */
+ if (folio_test_anon(folio) && split_order == 1)
+ continue;
+ if (uniform_split && split_order != new_order)
+ continue;
+
+ if (mapping) {
+ /*
+ * uniform split has xas_split_alloc() called before
+ * irq is disabled to allocate enough memory, whereas
+ * non-uniform split can handle ENOMEM.
+ */
+ if (uniform_split)
+ xas_split(xas, folio, old_order);
+ else {
+ xas_set_order(xas, folio->index, split_order);
+ xas_try_split(xas, folio, old_order);
+ if (xas_error(xas)) {
+ ret = xas_error(xas);
+ stop_split = true;
+ goto after_split;
+ }
+ }
}
- }
- if (!new_order)
- ClearPageCompound(head);
- else {
- struct folio *new_folio = (struct folio *)head;
+ folio_split_memcg_refs(folio, old_order, split_order);
+ split_page_owner(&folio->page, old_order, split_order);
+ pgalloc_tag_split(folio, old_order, split_order);
- folio_set_order(new_folio, new_order);
- }
- unlock_page_lruvec(lruvec);
- /* Caller disabled irqs, so they are still disabled here */
+ __split_folio_to_order(folio, old_order, split_order);
- split_page_owner(head, order, new_order);
- pgalloc_tag_split(folio, order, new_order);
+after_split:
+ /*
+ * Iterate through after-split folios and perform related
+ * operations. But in buddy allocator like split, the folio
+ * containing the specified page is skipped until its order
+ * is new_order, since the folio will be worked on in next
+ * iteration.
+ */
+ for (release = folio; release != end_folio; release = next) {
+ next = folio_next(release);
+ /*
+ * for buddy allocator like split, the folio containing
+ * page will be split next and should not be released,
+ * until the folio's order is new_order or stop_split
+ * is set to true by the above xas_split() failure.
+ */
+ if (release == page_folio(split_at)) {
+ folio = release;
+ if (split_order != new_order && !stop_split)
+ continue;
+ }
+ if (folio_test_anon(release)) {
+ mod_mthp_stat(folio_order(release),
+ MTHP_STAT_NR_ANON, 1);
+ }
- /* See comment in __split_huge_page_tail() */
- if (folio_test_anon(folio)) {
- /* Additional pin to swap cache */
- if (folio_test_swapcache(folio)) {
- folio_ref_add(folio, 1 + new_nr);
- xa_unlock(&swap_cache->i_pages);
- } else {
- folio_ref_inc(folio);
+ /*
+ * origin_folio should be kept frozon until page cache
+ * entries are updated with all the other after-split
+ * folios to prevent others seeing stale page cache
+ * entries.
+ */
+ if (release == origin_folio)
+ continue;
+
+ folio_ref_unfreeze(release, 1 +
+ ((mapping || swap_cache) ?
+ folio_nr_pages(release) : 0));
+
+ lru_add_split_folio(origin_folio, release, lruvec,
+ list);
+
+ /* Some pages can be beyond EOF: drop them from cache */
+ if (release->index >= end) {
+ if (shmem_mapping(mapping))
+ nr_dropped += folio_nr_pages(release);
+ else if (folio_test_clear_dirty(release))
+ folio_account_cleaned(release,
+ inode_to_wb(mapping->host));
+ __filemap_remove_folio(release, NULL);
+ folio_put_refs(release, folio_nr_pages(release));
+ } else if (mapping) {
+ __xa_store(&mapping->i_pages,
+ release->index, release, 0);
+ } else if (swap_cache) {
+ __xa_store(&swap_cache->i_pages,
+ swap_cache_index(release->swap),
+ release, 0);
+ }
}
- } else {
- /* Additional pin to page cache */
- folio_ref_add(folio, 1 + new_nr);
- xa_unlock(&folio->mapping->i_pages);
}
+
+ /*
+ * Unfreeze origin_folio only after all page cache entries, which used
+ * to point to it, have been updated with new folios. Otherwise,
+ * a parallel folio_try_get() can grab origin_folio and its caller can
+ * see stale page cache entries.
+ */
+ folio_ref_unfreeze(origin_folio, 1 +
+ ((mapping || swap_cache) ? folio_nr_pages(origin_folio) : 0));
+
+ unlock_page_lruvec(lruvec);
+
+ if (swap_cache)
+ xa_unlock(&swap_cache->i_pages);
+ if (mapping)
+ xa_unlock(&mapping->i_pages);
+
+ /* Caller disabled irqs, so they are still disabled here */
local_irq_enable();
if (nr_dropped)
- shmem_uncharge(folio->mapping->host, nr_dropped);
- remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
+ shmem_uncharge(mapping->host, nr_dropped);
+
+ remap_page(origin_folio, 1 << order,
+ folio_test_anon(origin_folio) ?
+ RMP_USE_SHARED_ZEROPAGE : 0);
/*
- * set page to its compound_head when split to non order-0 pages, so
- * we can skip unlocking it below, since PG_locked is transferred to
- * the compound_head of the page and the caller will unlock it.
+ * At this point, folio should contain the specified page.
+ * For uniform split, it is left for caller to unlock.
+ * For buddy allocator like split, the first after-split folio is left
+ * for caller to unlock.
*/
- if (new_order)
- page = compound_head(page);
-
- for (i = 0; i < nr; i += new_nr) {
- struct page *subpage = head + i;
- struct folio *new_folio = page_folio(subpage);
- if (subpage == page)
+ for (new_folio = origin_folio; new_folio != next_folio; new_folio = next) {
+ next = folio_next(new_folio);
+ if (new_folio == page_folio(lock_at))
continue;
- folio_unlock(new_folio);
+ folio_unlock(new_folio);
/*
* Subpages may be freed if there wasn't any mapping
* like if add_to_swap() is running on a lru page that
@@ -3369,81 +3641,90 @@ static void __split_huge_page(struct page *page, struct list_head *list,
* requires taking the lru_lock so we do the put_page
* of the tail pages after the split is complete.
*/
- free_page_and_swap_cache(subpage);
+ free_page_and_swap_cache(&new_folio->page);
}
+ return ret;
}
-/* Racy check whether the huge page can be split */
-bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
+bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
+ bool warns)
{
- int extra_pins;
+ if (folio_test_anon(folio)) {
+ /* order-1 is not supported for anonymous THP. */
+ VM_WARN_ONCE(warns && new_order == 1,
+ "Cannot split to order-1 folio");
+ return new_order != 1;
+ } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ !mapping_large_folio_support(folio->mapping)) {
+ /*
+ * No split if the file system does not support large folio.
+ * Note that we might still have THPs in such mappings due to
+ * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
+ * does not actually support large folios properly.
+ */
+ VM_WARN_ONCE(warns,
+ "Cannot split file folio to non-0 order");
+ return false;
+ }
- /* Additional pins from page cache */
- if (folio_test_anon(folio))
- extra_pins = folio_test_swapcache(folio) ?
- folio_nr_pages(folio) : 0;
- else
- extra_pins = folio_nr_pages(folio);
- if (pextra_pins)
- *pextra_pins = extra_pins;
- return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
- caller_pins;
+ /* Only swapping a whole PMD-mapped folio is supported */
+ if (folio_test_swapcache(folio)) {
+ VM_WARN_ONCE(warns,
+ "Cannot split swapcache folio to non-0 order");
+ return false;
+ }
+
+ return true;
+}
+
+/* See comments in non_uniform_split_supported() */
+bool uniform_split_supported(struct folio *folio, unsigned int new_order,
+ bool warns)
+{
+ if (folio_test_anon(folio)) {
+ VM_WARN_ONCE(warns && new_order == 1,
+ "Cannot split to order-1 folio");
+ return new_order != 1;
+ } else if (new_order) {
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ !mapping_large_folio_support(folio->mapping)) {
+ VM_WARN_ONCE(warns,
+ "Cannot split file folio to non-0 order");
+ return false;
+ }
+ }
+
+ if (new_order && folio_test_swapcache(folio)) {
+ VM_WARN_ONCE(warns,
+ "Cannot split swapcache folio to non-0 order");
+ return false;
+ }
+
+ return true;
}
/*
- * This function splits a large folio into smaller folios of order @new_order.
- * @page can point to any page of the large folio to split. The split operation
- * does not change the position of @page.
- *
- * Prerequisites:
- *
- * 1) The caller must hold a reference on the @page's owning folio, also known
- * as the large folio.
+ * __folio_split: split a folio at @split_at to a @new_order folio
+ * @folio: folio to split
+ * @new_order: the order of the new folio
+ * @split_at: a page within the new folio
+ * @lock_at: a page within @folio to be left locked to caller
+ * @list: after-split folios will be put on it if non NULL
+ * @uniform_split: perform uniform split or not (non-uniform split)
*
- * 2) The large folio must be locked.
+ * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
+ * It is in charge of checking whether the split is supported or not and
+ * preparing @folio for __split_unmapped_folio().
*
- * 3) The folio must not be pinned. Any unexpected folio references, including
- * GUP pins, will result in the folio not getting split; instead, the caller
- * will receive an -EAGAIN.
- *
- * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
- * supported for non-file-backed folios, because folio->_deferred_list, which
- * is used by partially mapped folios, is stored in subpage 2, but an order-1
- * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
- * since they do not use _deferred_list.
- *
- * After splitting, the caller's folio reference will be transferred to @page,
- * resulting in a raised refcount of @page after this call. The other pages may
- * be freed if they are not mapped.
- *
- * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
- *
- * Pages in @new_order will inherit the mapping, flags, and so on from the
- * huge page.
- *
- * Returns 0 if the huge page was split successfully.
- *
- * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
- * the folio was concurrently removed from the page cache.
- *
- * Returns -EBUSY when trying to split the huge zeropage, if the folio is
- * under writeback, if fs-specific folio metadata cannot currently be
- * released, or if some unexpected race happened (e.g., anon VMA disappeared,
- * truncation).
- *
- * Callers should ensure that the order respects the address space mapping
- * min-order if one is set for non-anonymous folios.
- *
- * Returns -EINVAL when trying to split to an order that is incompatible
- * with the folio. Splitting to order 0 is compatible with all folios.
+ * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
+ * split but not to @new_order, the caller needs to check)
*/
-int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
- unsigned int new_order)
+static int __folio_split(struct folio *folio, unsigned int new_order,
+ struct page *split_at, struct page *lock_at,
+ struct list_head *list, bool uniform_split)
{
- struct folio *folio = page_folio(page);
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
- /* reset xarray order to new order after split */
- XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
+ XA_STATE(xas, &folio->mapping->i_pages, folio->index);
bool is_anon = folio_test_anon(folio);
struct address_space *mapping = NULL;
struct anon_vma *anon_vma = NULL;
@@ -3455,38 +3736,17 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ if (folio != page_folio(split_at) || folio != page_folio(lock_at))
+ return -EINVAL;
+
if (new_order >= folio_order(folio))
return -EINVAL;
- if (is_anon) {
- /* order-1 is not supported for anonymous THP. */
- if (new_order == 1) {
- VM_WARN_ONCE(1, "Cannot split to order-1 folio");
- return -EINVAL;
- }
- } else if (new_order) {
- /* Split shmem folio to non-zero order not supported */
- if (shmem_mapping(folio->mapping)) {
- VM_WARN_ONCE(1,
- "Cannot split shmem folio to non-0 order");
- return -EINVAL;
- }
- /*
- * No split if the file system does not support large folio.
- * Note that we might still have THPs in such mappings due to
- * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
- * does not actually support large folios properly.
- */
- if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
- !mapping_large_folio_support(folio->mapping)) {
- VM_WARN_ONCE(1,
- "Cannot split file folio to non-0 order");
- return -EINVAL;
- }
- }
+ if (uniform_split && !uniform_split_supported(folio, new_order, true))
+ return -EINVAL;
- /* Only swapping a whole PMD-mapped folio is supported */
- if (folio_test_swapcache(folio) && new_order)
+ if (!uniform_split &&
+ !non_uniform_split_supported(folio, new_order, true))
return -EINVAL;
is_hzp = is_huge_zero_folio(folio);
@@ -3522,6 +3782,11 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
mapping = folio->mapping;
/* Truncated ? */
+ /*
+ * TODO: add support for large shmem folio in swap cache.
+ * When shmem is in swap cache, mapping is NULL and
+ * folio_test_swapcache() is true.
+ */
if (!mapping) {
ret = -EBUSY;
goto out;
@@ -3543,21 +3808,24 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
goto out;
}
- xas_split_alloc(&xas, folio, folio_order(folio), gfp);
- if (xas_error(&xas)) {
- ret = xas_error(&xas);
- goto out;
+ if (uniform_split) {
+ xas_set_order(&xas, folio->index, new_order);
+ xas_split_alloc(&xas, folio, folio_order(folio), gfp);
+ if (xas_error(&xas)) {
+ ret = xas_error(&xas);
+ goto out;
+ }
}
anon_vma = NULL;
i_mmap_lock_read(mapping);
/*
- *__split_huge_page() may need to trim off pages beyond EOF:
- * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
- * which cannot be nested inside the page tree lock. So note
- * end now: i_size itself may be changed at any moment, but
- * folio lock is good enough to serialize the trimming.
+ *__split_unmapped_folio() may need to trim off pages beyond
+ * EOF: but on 32-bit, i_size_read() takes an irq-unsafe
+ * seqlock, which cannot be nested inside the page tree lock.
+ * So note end now: i_size itself may be changed at any moment,
+ * but folio lock is good enough to serialize the trimming.
*/
end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
if (shmem_mapping(mapping))
@@ -3611,7 +3879,6 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
if (mapping) {
int nr = folio_nr_pages(folio);
- xas_split(&xas, folio, folio_order(folio));
if (folio_test_pmd_mappable(folio) &&
new_order < HPAGE_PMD_ORDER) {
if (folio_test_swapbacked(folio)) {
@@ -3625,12 +3892,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
}
}
- if (is_anon) {
- mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
- mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order));
- }
- __split_huge_page(page, list, end, new_order);
- ret = 0;
+ ret = __split_unmapped_folio(folio, new_order,
+ split_at, lock_at, list, end, &xas, mapping,
+ uniform_split);
} else {
spin_unlock(&ds_queue->split_queue_lock);
fail:
@@ -3656,6 +3920,90 @@ out:
return ret;
}
+/*
+ * This function splits a large folio into smaller folios of order @new_order.
+ * @page can point to any page of the large folio to split. The split operation
+ * does not change the position of @page.
+ *
+ * Prerequisites:
+ *
+ * 1) The caller must hold a reference on the @page's owning folio, also known
+ * as the large folio.
+ *
+ * 2) The large folio must be locked.
+ *
+ * 3) The folio must not be pinned. Any unexpected folio references, including
+ * GUP pins, will result in the folio not getting split; instead, the caller
+ * will receive an -EAGAIN.
+ *
+ * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
+ * supported for non-file-backed folios, because folio->_deferred_list, which
+ * is used by partially mapped folios, is stored in subpage 2, but an order-1
+ * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
+ * since they do not use _deferred_list.
+ *
+ * After splitting, the caller's folio reference will be transferred to @page,
+ * resulting in a raised refcount of @page after this call. The other pages may
+ * be freed if they are not mapped.
+ *
+ * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+ *
+ * Pages in @new_order will inherit the mapping, flags, and so on from the
+ * huge page.
+ *
+ * Returns 0 if the huge page was split successfully.
+ *
+ * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
+ * the folio was concurrently removed from the page cache.
+ *
+ * Returns -EBUSY when trying to split the huge zeropage, if the folio is
+ * under writeback, if fs-specific folio metadata cannot currently be
+ * released, or if some unexpected race happened (e.g., anon VMA disappeared,
+ * truncation).
+ *
+ * Callers should ensure that the order respects the address space mapping
+ * min-order if one is set for non-anonymous folios.
+ *
+ * Returns -EINVAL when trying to split to an order that is incompatible
+ * with the folio. Splitting to order 0 is compatible with all folios.
+ */
+int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+ unsigned int new_order)
+{
+ struct folio *folio = page_folio(page);
+
+ return __folio_split(folio, new_order, &folio->page, page, list, true);
+}
+
+/*
+ * folio_split: split a folio at @split_at to a @new_order folio
+ * @folio: folio to split
+ * @new_order: the order of the new folio
+ * @split_at: a page within the new folio
+ *
+ * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
+ * split but not to @new_order, the caller needs to check)
+ *
+ * It has the same prerequisites and returns as
+ * split_huge_page_to_list_to_order().
+ *
+ * Split a folio at @split_at to a new_order folio, leave the
+ * remaining subpages of the original folio as large as possible. For example,
+ * in the case of splitting an order-9 folio at its third order-3 subpages to
+ * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio.
+ * After the split, there will be a group of folios with different orders and
+ * the new folio containing @split_at is marked in bracket:
+ * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
+ *
+ * After split, folio is left locked for caller.
+ */
+int folio_split(struct folio *folio, unsigned int new_order,
+ struct page *split_at, struct list_head *list)
+{
+ return __folio_split(folio, new_order, split_at, &folio->page, list,
+ false);
+}
+
int min_order_for_split(struct folio *folio)
{
if (folio_test_anon(folio))
@@ -3740,7 +4088,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
/*
* Exclude swapcache: originally to avoid a corrupt deferred split
- * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+ * queue. Nowadays that is fully prevented by memcg1_swapout();
* but if page reclaim is already handling the same folio, it is
* unnecessary to handle it again in the shrinker, so excluding
* swapcache here may still be a useful optimization.
@@ -3975,7 +4323,8 @@ static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
}
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
- unsigned long vaddr_end, unsigned int new_order)
+ unsigned long vaddr_end, unsigned int new_order,
+ long in_folio_offset)
{
int ret = 0;
struct task_struct *task;
@@ -4059,8 +4408,16 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
if (!folio_test_anon(folio) && folio->mapping != mapping)
goto unlock;
- if (!split_folio_to_order(folio, target_order))
- split++;
+ if (in_folio_offset < 0 ||
+ in_folio_offset >= folio_nr_pages(folio)) {
+ if (!split_folio_to_order(folio, target_order))
+ split++;
+ } else {
+ struct page *split_at = folio_page(folio,
+ in_folio_offset);
+ if (!folio_split(folio, target_order, split_at, NULL))
+ split++;
+ }
unlock:
@@ -4083,7 +4440,8 @@ out:
}
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
- pgoff_t off_end, unsigned int new_order)
+ pgoff_t off_end, unsigned int new_order,
+ long in_folio_offset)
{
struct filename *file;
struct file *candidate;
@@ -4132,8 +4490,15 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
if (folio->mapping != mapping)
goto unlock;
- if (!split_folio_to_order(folio, target_order))
- split++;
+ if (in_folio_offset < 0 || in_folio_offset >= nr_pages) {
+ if (!split_folio_to_order(folio, target_order))
+ split++;
+ } else {
+ struct page *split_at = folio_page(folio,
+ in_folio_offset);
+ if (!folio_split(folio, target_order, split_at, NULL))
+ split++;
+ }
unlock:
folio_unlock(folio);
@@ -4166,6 +4531,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
int pid;
unsigned long vaddr_start, vaddr_end;
unsigned int new_order = 0;
+ long in_folio_offset = -1;
ret = mutex_lock_interruptible(&split_debug_mutex);
if (ret)
@@ -4194,30 +4560,33 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
goto out;
}
- ret = sscanf(tok_buf, "0x%lx,0x%lx,%d", &off_start,
- &off_end, &new_order);
- if (ret != 2 && ret != 3) {
+ ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end,
+ &new_order, &in_folio_offset);
+ if (ret != 2 && ret != 3 && ret != 4) {
ret = -EINVAL;
goto out;
}
- ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
+ ret = split_huge_pages_in_file(file_path, off_start, off_end,
+ new_order, in_folio_offset);
if (!ret)
ret = input_len;
goto out;
}
- ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
+ ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start,
+ &vaddr_end, &new_order, &in_folio_offset);
if (ret == 1 && pid == 1) {
split_huge_pages_all();
ret = strlen(input_buf);
goto out;
- } else if (ret != 3 && ret != 4) {
+ } else if (ret != 3 && ret != 4 && ret != 5) {
ret = -EINVAL;
goto out;
}
- ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
+ ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order,
+ in_folio_offset);
if (!ret)
ret = strlen(input_buf);
out:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65068671e460..39f92aad7bd1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -14,9 +14,11 @@
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
+#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
+#include <linux/minmax.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
@@ -40,6 +42,7 @@
#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
+#include <asm/setup.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
@@ -48,19 +51,33 @@
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"
+#include "hugetlb_cma.h"
#include <linux/page-isolation.h>
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
-#ifdef CONFIG_CMA
-static struct cma *hugetlb_cma[MAX_NUMNODES];
-static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
-#endif
-static unsigned long hugetlb_cma_size __initdata;
-
__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
+static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata;
+
+/*
+ * Due to ordering constraints across the init code for various
+ * architectures, hugetlb hstate cmdline parameters can't simply
+ * be early_param. early_param might call the setup function
+ * before valid hugetlb page sizes are determined, leading to
+ * incorrect rejection of valid hugepagesz= options.
+ *
+ * So, record the parameters early and consume them whenever the
+ * init code is ready for them, by calling hugetlb_parse_params().
+ */
+
+/* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */
+#define HUGE_MAX_CMDLINE_ARGS (2 * HUGE_MAX_HSTATE + 1)
+struct hugetlb_cmdline {
+ char *val;
+ int (*setup)(char *val);
+};
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
@@ -68,6 +85,21 @@ static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
+static unsigned long hugepage_allocation_threads __initdata;
+
+static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata;
+static int hstate_cmdline_index __initdata;
+static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata;
+static int hugetlb_param_index __initdata;
+static __init int hugetlb_add_param(char *s, int (*setup)(char *val));
+static __init void hugetlb_parse_params(void);
+
+#define hugetlb_early_param(str, func) \
+static __init int func##args(char *s) \
+{ \
+ return hugetlb_add_param(s, func); \
+} \
+early_param(str, func##args)
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -93,12 +125,11 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
{
-#ifdef CONFIG_CMA
- int nid = folio_nid(folio);
-
- if (cma_free_folio(hugetlb_cma[nid], folio))
+ if (folio_test_hugetlb_cma(folio)) {
+ hugetlb_cma_free_folio(folio);
return;
-#endif
+ }
+
folio_put(folio);
}
@@ -1455,27 +1486,11 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
retry:
- folio = NULL;
-#ifdef CONFIG_CMA
- {
- int node;
-
- if (hugetlb_cma[nid])
- folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask);
-
- if (!folio && !(gfp_mask & __GFP_THISNODE)) {
- for_each_node_mask(node, *nodemask) {
- if (node == nid || !hugetlb_cma[node])
- continue;
-
- folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask);
- if (folio)
- break;
- }
- }
- }
-#endif
+ folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask);
if (!folio) {
+ if (hugetlb_cma_exclusive_alloc())
+ return NULL;
+
folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask);
if (!folio)
return NULL;
@@ -1634,7 +1649,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
folio_ref_unfreeze(folio, 1);
- INIT_LIST_HEAD(&folio->_deferred_list);
hugetlb_free_folio(folio);
}
@@ -2135,6 +2149,8 @@ retry:
if (!folio_ref_count(folio)) {
struct hstate *h = folio_hstate(folio);
+ bool adjust_surplus = false;
+
if (!available_huge_pages(h))
goto out;
@@ -2157,7 +2173,9 @@ retry:
goto retry;
}
- remove_hugetlb_folio(h, folio, false);
+ if (h->surplus_huge_pages_node[folio_nid(folio)])
+ adjust_surplus = true;
+ remove_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
@@ -2177,7 +2195,7 @@ retry:
rc = hugetlb_vmemmap_restore_folio(h, folio);
if (rc) {
spin_lock_irq(&hugetlb_lock);
- add_hugetlb_folio(h, folio, false);
+ add_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages++;
goto out;
}
@@ -2241,12 +2259,21 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
- folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
+ folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
if (!folio)
return NULL;
+ hugetlb_vmemmap_optimize_folio(h, folio);
+
spin_lock_irq(&hugetlb_lock);
/*
+ * nr_huge_pages needs to be adjusted within the same lock cycle
+ * as surplus_pages, otherwise it might confuse
+ * persistent_huge_pages() momentarily.
+ */
+ __prep_account_new_huge_page(h, nid);
+
+ /*
* We could have raced with the pool size change.
* Double check that and simply deallocate the new page
* if we would end up overcommiting the surpluses. Abuse
@@ -2943,6 +2970,14 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
return ret;
}
+void wait_for_freed_hugetlb_folios(void)
+{
+ if (llist_empty(&hpage_freelist))
+ return;
+
+ flush_work(&free_hpage_work);
+}
+
typedef enum {
/*
* For either 0/1: we checked the per-vma resv map, and one resv
@@ -3136,6 +3171,56 @@ out_end_reservation:
return ERR_PTR(-ENOSPC);
}
+static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact)
+{
+ struct huge_bootmem_page *m;
+ int listnode = nid;
+
+ if (hugetlb_early_cma(h))
+ m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact);
+ else {
+ if (node_exact)
+ m = memblock_alloc_exact_nid_raw(huge_page_size(h),
+ huge_page_size(h), 0,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ else {
+ m = memblock_alloc_try_nid_raw(huge_page_size(h),
+ huge_page_size(h), 0,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ /*
+ * For pre-HVO to work correctly, pages need to be on
+ * the list for the node they were actually allocated
+ * from. That node may be different in the case of
+ * fallback by memblock_alloc_try_nid_raw. So,
+ * extract the actual node first.
+ */
+ if (m)
+ listnode = early_pfn_to_nid(PHYS_PFN(virt_to_phys(m)));
+ }
+
+ if (m) {
+ m->flags = 0;
+ m->cma = NULL;
+ }
+ }
+
+ if (m) {
+ /*
+ * Use the beginning of the huge page to store the
+ * huge_bootmem_page struct (until gather_bootmem
+ * puts them into the mem_map).
+ *
+ * Put them into a private list first because mem_map
+ * is not up yet.
+ */
+ INIT_LIST_HEAD(&m->list);
+ list_add(&m->list, &huge_boot_pages[listnode]);
+ m->hstate = h;
+ }
+
+ return m;
+}
+
int alloc_bootmem_huge_page(struct hstate *h, int nid)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
int __alloc_bootmem_huge_page(struct hstate *h, int nid)
@@ -3145,22 +3230,15 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
/* do node specific alloc */
if (nid != NUMA_NO_NODE) {
- m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
- 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ m = alloc_bootmem(h, node, true);
if (!m)
return 0;
goto found;
}
+
/* allocate from next node when distributing huge pages */
- for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) {
- m = memblock_alloc_try_nid_raw(
- huge_page_size(h), huge_page_size(h),
- 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
- /*
- * Use the beginning of the huge page to store the
- * huge_bootmem_page struct (until gather_bootmem
- * puts them into the mem_map).
- */
+ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) {
+ m = alloc_bootmem(h, node, false);
if (!m)
return 0;
goto found;
@@ -3177,10 +3255,7 @@ found:
*/
memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
huge_page_size(h) - PAGE_SIZE);
- /* Put them into a private list first because mem_map is not up yet */
- INIT_LIST_HEAD(&m->list);
- list_add(&m->list, &huge_boot_pages[node]);
- m->hstate = h;
+
return 1;
}
@@ -3198,7 +3273,6 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
struct page *page = pfn_to_page(pfn);
- __ClearPageReserved(folio_page(folio, pfn - head_pfn));
__init_single_page(page, pfn, zone, nid);
prep_compound_tail((struct page *)folio, pfn - head_pfn);
ret = page_ref_freeze(page, 1);
@@ -3222,6 +3296,42 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
prep_compound_head((struct page *)folio, huge_page_order(h));
}
+static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
+{
+ return m->flags & HUGE_BOOTMEM_HVO;
+}
+
+static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m)
+{
+ return m->flags & HUGE_BOOTMEM_CMA;
+}
+
+/*
+ * memblock-allocated pageblocks might not have the migrate type set
+ * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE)
+ * here, or MIGRATE_CMA if this was a page allocated through an early CMA
+ * reservation.
+ *
+ * In case of vmemmap optimized folios, the tail vmemmap pages are mapped
+ * read-only, but that's ok - for sparse vmemmap this does not write to
+ * the page structure.
+ */
+static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
+ struct hstate *h)
+{
+ unsigned long nr_pages = pages_per_huge_page(h), i;
+
+ WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio)));
+
+ for (i = 0; i < nr_pages; i += pageblock_nr_pages) {
+ if (folio_test_hugetlb_cma(folio))
+ init_cma_pageblock(folio_page(folio, i));
+ else
+ set_pageblock_migratetype(folio_page(folio, i),
+ MIGRATE_MOVABLE);
+ }
+}
+
static void __init prep_and_add_bootmem_folios(struct hstate *h,
struct list_head *folio_list)
{
@@ -3229,7 +3339,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
struct folio *folio, *tmp_f;
/* Send list for bulk vmemmap optimization processing */
- hugetlb_vmemmap_optimize_folios(h, folio_list);
+ hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list);
list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
@@ -3243,6 +3353,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
HUGETLB_VMEMMAP_RESERVE_PAGES,
pages_per_huge_page(h));
}
+ hugetlb_bootmem_init_migratetype(folio, h);
/* Subdivide locks to achieve better parallel performance */
spin_lock_irqsave(&hugetlb_lock, flags);
__prep_account_new_huge_page(h, folio_nid(folio));
@@ -3251,6 +3362,57 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
}
}
+bool __init hugetlb_bootmem_page_zones_valid(int nid,
+ struct huge_bootmem_page *m)
+{
+ unsigned long start_pfn;
+ bool valid;
+
+ if (m->flags & HUGE_BOOTMEM_ZONES_VALID) {
+ /*
+ * Already validated, skip check.
+ */
+ return true;
+ }
+
+ if (hugetlb_bootmem_page_earlycma(m)) {
+ valid = cma_validate_zones(m->cma);
+ goto out;
+ }
+
+ start_pfn = virt_to_phys(m) >> PAGE_SHIFT;
+
+ valid = !pfn_range_intersects_zones(nid, start_pfn,
+ pages_per_huge_page(m->hstate));
+out:
+ if (!valid)
+ hstate_boot_nrinvalid[hstate_index(m->hstate)]++;
+
+ return valid;
+}
+
+/*
+ * Free a bootmem page that was found to be invalid (intersecting with
+ * multiple zones).
+ *
+ * Since it intersects with multiple zones, we can't just do a free
+ * operation on all pages at once, but instead have to walk all
+ * pages, freeing them one by one.
+ */
+static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page,
+ struct hstate *h)
+{
+ unsigned long npages = pages_per_huge_page(h);
+ unsigned long pfn;
+
+ while (npages--) {
+ pfn = page_to_pfn(page);
+ __init_page_from_nid(pfn, nid);
+ free_reserved_page(page);
+ page++;
+ }
+}
+
/*
* Put bootmem huge pages into the standard lists after mem_map is up.
* Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
@@ -3258,14 +3420,25 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
static void __init gather_bootmem_prealloc_node(unsigned long nid)
{
LIST_HEAD(folio_list);
- struct huge_bootmem_page *m;
+ struct huge_bootmem_page *m, *tm;
struct hstate *h = NULL, *prev_h = NULL;
- list_for_each_entry(m, &huge_boot_pages[nid], list) {
+ list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
struct page *page = virt_to_page(m);
struct folio *folio = (void *)page;
h = m->hstate;
+ if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
+ /*
+ * Can't use this page. Initialize the
+ * page structures if that hasn't already
+ * been done, and give them to the page
+ * allocator.
+ */
+ hugetlb_bootmem_free_invalid_page(nid, page, h);
+ continue;
+ }
+
/*
* It is possible to have multiple huge page sizes (hstates)
* in this list. If so, process each size separately.
@@ -3280,14 +3453,30 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid)
hugetlb_folio_init_vmemmap(folio, h,
HUGETLB_VMEMMAP_RESERVE_PAGES);
init_new_hugetlb_folio(h, folio);
+
+ if (hugetlb_bootmem_page_prehvo(m))
+ /*
+ * If pre-HVO was done, just set the
+ * flag, the HVO code will then skip
+ * this folio.
+ */
+ folio_set_hugetlb_vmemmap_optimized(folio);
+
+ if (hugetlb_bootmem_page_earlycma(m))
+ folio_set_hugetlb_cma(folio);
+
list_add(&folio->lru, &folio_list);
/*
* We need to restore the 'stolen' pages to totalram_pages
* in order to fix confusing memory reports from free(1) and
* other side-effects, like CommitLimit going negative.
+ *
+ * For CMA pages, this is done in init_cma_pageblock
+ * (via hugetlb_bootmem_init_migratetype), so skip it here.
*/
- adjust_managed_page_count(page, pages_per_huge_page(h));
+ if (!folio_test_hugetlb_cma(folio))
+ adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
@@ -3427,32 +3616,44 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
.numa_aware = true
};
+ unsigned long jiffies_start;
+ unsigned long jiffies_end;
+
job.thread_fn = hugetlb_pages_alloc_boot_node;
job.start = 0;
job.size = h->max_huge_pages;
/*
- * job.max_threads is twice the num_node_state(N_MEMORY),
+ * job.max_threads is 25% of the available cpu threads by default.
*
- * Tests below indicate that a multiplier of 2 significantly improves
- * performance, and although larger values also provide improvements,
- * the gains are marginal.
+ * On large servers with terabytes of memory, huge page allocation
+ * can consume a considerably amount of time.
*
- * Therefore, choosing 2 as the multiplier strikes a good balance between
- * enhancing parallel processing capabilities and maintaining efficient
- * resource management.
+ * Tests below show how long it takes to allocate 1 TiB of memory with 2MiB huge pages.
+ * 2MiB huge pages. Using more threads can significantly improve allocation time.
*
- * +------------+-------+-------+-------+-------+-------+
- * | multiplier | 1 | 2 | 3 | 4 | 5 |
- * +------------+-------+-------+-------+-------+-------+
- * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms |
- * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms |
- * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms |
- * +------------+-------+-------+-------+-------+-------+
+ * +-----------------------+-------+-------+-------+-------+-------+
+ * | threads | 8 | 16 | 32 | 64 | 128 |
+ * +-----------------------+-------+-------+-------+-------+-------+
+ * | skylake 144 cpus | 44s | 22s | 16s | 19s | 20s |
+ * | cascade lake 192 cpus | 39s | 20s | 11s | 10s | 9s |
+ * +-----------------------+-------+-------+-------+-------+-------+
*/
- job.max_threads = num_node_state(N_MEMORY) * 2;
- job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2;
+ if (hugepage_allocation_threads == 0) {
+ hugepage_allocation_threads = num_online_cpus() / 4;
+ hugepage_allocation_threads = max(hugepage_allocation_threads, 1);
+ }
+
+ job.max_threads = hugepage_allocation_threads;
+ job.min_chunk = h->max_huge_pages / hugepage_allocation_threads;
+
+ jiffies_start = jiffies;
padata_do_multithreaded(&job);
+ jiffies_end = jiffies;
+
+ pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n",
+ jiffies_to_msecs(jiffies_end - jiffies_start),
+ hugepage_allocation_threads);
return h->nr_huge_pages;
}
@@ -3471,23 +3672,17 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long allocated;
- static bool initialized __initdata;
- /* skip gigantic hugepages allocation if hugetlb_cma enabled */
- if (hstate_is_gigantic(h) && hugetlb_cma_size) {
+ /*
+ * Skip gigantic hugepages allocation if early CMA
+ * reservations are not available.
+ */
+ if (hstate_is_gigantic(h) && hugetlb_cma_total_size() &&
+ !hugetlb_early_cma(h)) {
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
return;
}
- /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */
- if (!initialized) {
- int i = 0;
-
- for (i = 0; i < MAX_NUMNODES; i++)
- INIT_LIST_HEAD(&huge_boot_pages[i]);
- initialized = true;
- }
-
/* do node specific alloc */
if (hugetlb_hstate_alloc_pages_specific_nodes(h))
return;
@@ -3520,7 +3715,7 @@ static void __init hugetlb_init_hstates(void)
*/
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
continue;
- if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
+ if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER)
continue;
for_each_hstate(h2) {
if (h2 == h)
@@ -3535,13 +3730,20 @@ static void __init hugetlb_init_hstates(void)
static void __init report_hugepages(void)
{
struct hstate *h;
+ unsigned long nrinvalid;
for_each_hstate(h) {
char buf[32];
+ nrinvalid = hstate_boot_nrinvalid[hstate_index(h)];
+ h->max_huge_pages -= nrinvalid;
+
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
buf, h->free_huge_pages);
+ if (nrinvalid)
+ pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
+ buf, nrinvalid, nrinvalid > 1 ? "s" : "");
pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
}
@@ -4415,14 +4617,6 @@ static void hugetlb_register_all_nodes(void) { }
#endif
-#ifdef CONFIG_CMA
-static void __init hugetlb_cma_check(void);
-#else
-static inline __init void hugetlb_cma_check(void)
-{
-}
-#endif
-
static void __init hugetlb_sysfs_init(void)
{
struct hstate *h;
@@ -4547,8 +4741,6 @@ void __init hugetlb_add_hstate(unsigned int order)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
- h->next_nid_to_alloc = first_memory_node;
- h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/SZ_1K);
@@ -4573,6 +4765,44 @@ static void __init hugepages_clear_pages_in_node(void)
}
}
+static __init int hugetlb_add_param(char *s, int (*setup)(char *))
+{
+ size_t len;
+ char *p;
+
+ if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS)
+ return -EINVAL;
+
+ len = strlen(s) + 1;
+ if (len + hstate_cmdline_index > sizeof(hstate_cmdline_buf))
+ return -EINVAL;
+
+ p = &hstate_cmdline_buf[hstate_cmdline_index];
+ memcpy(p, s, len);
+ hstate_cmdline_index += len;
+
+ hugetlb_params[hugetlb_param_index].val = p;
+ hugetlb_params[hugetlb_param_index].setup = setup;
+
+ hugetlb_param_index++;
+
+ return 0;
+}
+
+static __init void hugetlb_parse_params(void)
+{
+ int i;
+ struct hugetlb_cmdline *hcp;
+
+ for (i = 0; i < hugetlb_param_index; i++) {
+ hcp = &hugetlb_params[i];
+
+ hcp->setup(hcp->val);
+ }
+
+ hugetlb_cma_validate_params();
+}
+
/*
* hugepages command line processing
* hugepages normally follows a valid hugepagsz or default_hugepagsz
@@ -4592,7 +4822,7 @@ static int __init hugepages_setup(char *s)
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
- return 1;
+ return -EINVAL;
}
/*
@@ -4646,24 +4876,16 @@ static int __init hugepages_setup(char *s)
}
}
- /*
- * Global state is always initialized later in hugetlb_init.
- * But we need to allocate gigantic hstates here early to still
- * use the bootmem allocator.
- */
- if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
- hugetlb_hstate_alloc_pages(parsed_hstate);
-
last_mhp = mhp;
- return 1;
+ return 0;
invalid:
pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
hugepages_clear_pages_in_node();
- return 1;
+ return -EINVAL;
}
-__setup("hugepages=", hugepages_setup);
+hugetlb_early_param("hugepages", hugepages_setup);
/*
* hugepagesz command line processing
@@ -4682,7 +4904,7 @@ static int __init hugepagesz_setup(char *s)
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
- return 1;
+ return -EINVAL;
}
h = size_to_hstate(size);
@@ -4697,7 +4919,7 @@ static int __init hugepagesz_setup(char *s)
if (!parsed_default_hugepagesz || h != &default_hstate ||
default_hstate.max_huge_pages) {
pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
- return 1;
+ return -EINVAL;
}
/*
@@ -4707,14 +4929,14 @@ static int __init hugepagesz_setup(char *s)
*/
parsed_hstate = h;
parsed_valid_hugepagesz = true;
- return 1;
+ return 0;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
parsed_valid_hugepagesz = true;
- return 1;
+ return 0;
}
-__setup("hugepagesz=", hugepagesz_setup);
+hugetlb_early_param("hugepagesz", hugepagesz_setup);
/*
* default_hugepagesz command line input
@@ -4728,14 +4950,14 @@ static int __init default_hugepagesz_setup(char *s)
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
- return 1;
+ return -EINVAL;
}
size = (unsigned long)memparse(s, NULL);
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
- return 1;
+ return -EINVAL;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
@@ -4752,17 +4974,74 @@ static int __init default_hugepagesz_setup(char *s)
*/
if (default_hstate_max_huge_pages) {
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
- for_each_online_node(i)
- default_hstate.max_huge_pages_node[i] =
- default_hugepages_in_node[i];
- if (hstate_is_gigantic(&default_hstate))
- hugetlb_hstate_alloc_pages(&default_hstate);
+ /*
+ * Since this is an early parameter, we can't check
+ * NUMA node state yet, so loop through MAX_NUMNODES.
+ */
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (default_hugepages_in_node[i] != 0)
+ default_hstate.max_huge_pages_node[i] =
+ default_hugepages_in_node[i];
+ }
default_hstate_max_huge_pages = 0;
}
+ return 0;
+}
+hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup);
+
+static bool __hugetlb_bootmem_allocated __initdata;
+
+bool __init hugetlb_bootmem_allocated(void)
+{
+ return __hugetlb_bootmem_allocated;
+}
+
+void __init hugetlb_bootmem_alloc(void)
+{
+ struct hstate *h;
+ int i;
+
+ if (__hugetlb_bootmem_allocated)
+ return;
+
+ for (i = 0; i < MAX_NUMNODES; i++)
+ INIT_LIST_HEAD(&huge_boot_pages[i]);
+
+ hugetlb_parse_params();
+
+ for_each_hstate(h) {
+ h->next_nid_to_alloc = first_online_node;
+ h->next_nid_to_free = first_online_node;
+
+ if (hstate_is_gigantic(h))
+ hugetlb_hstate_alloc_pages(h);
+ }
+
+ __hugetlb_bootmem_allocated = true;
+}
+
+/*
+ * hugepage_alloc_threads command line parsing.
+ *
+ * When set, use this specific number of threads for the boot
+ * allocation of hugepages.
+ */
+static int __init hugepage_alloc_threads_setup(char *s)
+{
+ unsigned long allocation_threads;
+
+ if (kstrtoul(s, 0, &allocation_threads) != 0)
+ return 1;
+
+ if (allocation_threads == 0)
+ return 1;
+
+ hugepage_allocation_threads = allocation_threads;
+
return 1;
}
-__setup("default_hugepagesz=", default_hugepagesz_setup);
+__setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup);
static unsigned int allowed_mems_nr(struct hstate *h)
{
@@ -4900,7 +5179,7 @@ static const struct ctl_table hugetlb_table[] = {
},
};
-static void hugetlb_sysctl_init(void)
+static void __init hugetlb_sysctl_init(void)
{
register_sysctl_init("vm", hugetlb_table);
}
@@ -5447,7 +5726,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
if (src_ptl != dst_ptl)
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
- pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
+ pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);
if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
huge_pte_clear(mm, new_addr, dst_pte, sz);
@@ -5622,7 +5901,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
}
- pte = huge_ptep_get_and_clear(mm, address, ptep);
+ pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
@@ -7613,163 +7892,3 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
ALIGN_DOWN(vma->vm_end, PUD_SIZE));
}
-
-#ifdef CONFIG_CMA
-static bool cma_reserve_called __initdata;
-
-static int __init cmdline_parse_hugetlb_cma(char *p)
-{
- int nid, count = 0;
- unsigned long tmp;
- char *s = p;
-
- while (*s) {
- if (sscanf(s, "%lu%n", &tmp, &count) != 1)
- break;
-
- if (s[count] == ':') {
- if (tmp >= MAX_NUMNODES)
- break;
- nid = array_index_nospec(tmp, MAX_NUMNODES);
-
- s += count + 1;
- tmp = memparse(s, &s);
- hugetlb_cma_size_in_node[nid] = tmp;
- hugetlb_cma_size += tmp;
-
- /*
- * Skip the separator if have one, otherwise
- * break the parsing.
- */
- if (*s == ',')
- s++;
- else
- break;
- } else {
- hugetlb_cma_size = memparse(p, &p);
- break;
- }
- }
-
- return 0;
-}
-
-early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
-
-void __init hugetlb_cma_reserve(int order)
-{
- unsigned long size, reserved, per_node;
- bool node_specific_cma_alloc = false;
- int nid;
-
- /*
- * HugeTLB CMA reservation is required for gigantic
- * huge pages which could not be allocated via the
- * page allocator. Just warn if there is any change
- * breaking this assumption.
- */
- VM_WARN_ON(order <= MAX_PAGE_ORDER);
- cma_reserve_called = true;
-
- if (!hugetlb_cma_size)
- return;
-
- for (nid = 0; nid < MAX_NUMNODES; nid++) {
- if (hugetlb_cma_size_in_node[nid] == 0)
- continue;
-
- if (!node_online(nid)) {
- pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
- hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
- hugetlb_cma_size_in_node[nid] = 0;
- continue;
- }
-
- if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
- pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
- nid, (PAGE_SIZE << order) / SZ_1M);
- hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
- hugetlb_cma_size_in_node[nid] = 0;
- } else {
- node_specific_cma_alloc = true;
- }
- }
-
- /* Validate the CMA size again in case some invalid nodes specified. */
- if (!hugetlb_cma_size)
- return;
-
- if (hugetlb_cma_size < (PAGE_SIZE << order)) {
- pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
- (PAGE_SIZE << order) / SZ_1M);
- hugetlb_cma_size = 0;
- return;
- }
-
- if (!node_specific_cma_alloc) {
- /*
- * If 3 GB area is requested on a machine with 4 numa nodes,
- * let's allocate 1 GB on first three nodes and ignore the last one.
- */
- per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
- pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
- hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
- }
-
- reserved = 0;
- for_each_online_node(nid) {
- int res;
- char name[CMA_MAX_NAME];
-
- if (node_specific_cma_alloc) {
- if (hugetlb_cma_size_in_node[nid] == 0)
- continue;
-
- size = hugetlb_cma_size_in_node[nid];
- } else {
- size = min(per_node, hugetlb_cma_size - reserved);
- }
-
- size = round_up(size, PAGE_SIZE << order);
-
- snprintf(name, sizeof(name), "hugetlb%d", nid);
- /*
- * Note that 'order per bit' is based on smallest size that
- * may be returned to CMA allocator in the case of
- * huge page demotion.
- */
- res = cma_declare_contiguous_nid(0, size, 0,
- PAGE_SIZE << order,
- HUGETLB_PAGE_ORDER, false, name,
- &hugetlb_cma[nid], nid);
- if (res) {
- pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
- res, nid);
- continue;
- }
-
- reserved += size;
- pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
- size / SZ_1M, nid);
-
- if (reserved >= hugetlb_cma_size)
- break;
- }
-
- if (!reserved)
- /*
- * hugetlb_cma_size is used to determine if allocations from
- * cma are possible. Set to zero if no cma regions are set up.
- */
- hugetlb_cma_size = 0;
-}
-
-static void __init hugetlb_cma_check(void)
-{
- if (!hugetlb_cma_size || cma_reserve_called)
- return;
-
- pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
-}
-
-#endif /* CONFIG_CMA */
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index bb9578bd99f9..58e895f3899a 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -101,10 +101,9 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
int idx;
for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
- struct page_counter *fault_parent = NULL;
- struct page_counter *rsvd_parent = NULL;
+ struct page_counter *fault, *fault_parent = NULL;
+ struct page_counter *rsvd, *rsvd_parent = NULL;
unsigned long limit;
- int ret;
if (parent_h_cgroup) {
fault_parent = hugetlb_cgroup_counter_from_cgroup(
@@ -112,24 +111,22 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
parent_h_cgroup, idx);
}
- page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
- idx),
- fault_parent, false);
- page_counter_init(
- hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
- rsvd_parent, false);
+ fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx);
+ rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx);
+
+ page_counter_init(fault, fault_parent, false);
+ page_counter_init(rsvd, rsvd_parent, false);
+
+ if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) {
+ fault->track_failcnt = true;
+ rsvd->track_failcnt = true;
+ }
limit = round_down(PAGE_COUNTER_MAX,
pages_per_huge_page(&hstates[idx]));
- ret = page_counter_set_max(
- hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
- limit);
- VM_BUG_ON(ret);
- ret = page_counter_set_max(
- hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
- limit);
- VM_BUG_ON(ret);
+ VM_BUG_ON(page_counter_set_max(fault, limit));
+ VM_BUG_ON(page_counter_set_max(rsvd, limit));
}
}
diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c
new file mode 100644
index 000000000000..e0f2d5c3a84c
--- /dev/null
+++ b/mm/hugetlb_cma.c
@@ -0,0 +1,275 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/mm.h>
+#include <linux/cma.h>
+#include <linux/compiler.h>
+#include <linux/mm_inline.h>
+
+#include <asm/page.h>
+#include <asm/setup.h>
+
+#include <linux/hugetlb.h>
+#include "internal.h"
+#include "hugetlb_cma.h"
+
+
+static struct cma *hugetlb_cma[MAX_NUMNODES];
+static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
+static bool hugetlb_cma_only;
+static unsigned long hugetlb_cma_size __initdata;
+
+void hugetlb_cma_free_folio(struct folio *folio)
+{
+ int nid = folio_nid(folio);
+
+ WARN_ON_ONCE(!cma_free_folio(hugetlb_cma[nid], folio));
+}
+
+
+struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ int node;
+ int order = huge_page_order(h);
+ struct folio *folio = NULL;
+
+ if (hugetlb_cma[nid])
+ folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask);
+
+ if (!folio && !(gfp_mask & __GFP_THISNODE)) {
+ for_each_node_mask(node, *nodemask) {
+ if (node == nid || !hugetlb_cma[node])
+ continue;
+
+ folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask);
+ if (folio)
+ break;
+ }
+ }
+
+ if (folio)
+ folio_set_hugetlb_cma(folio);
+
+ return folio;
+}
+
+struct huge_bootmem_page * __init
+hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact)
+{
+ struct cma *cma;
+ struct huge_bootmem_page *m;
+ int node = *nid;
+
+ cma = hugetlb_cma[*nid];
+ m = cma_reserve_early(cma, huge_page_size(h));
+ if (!m) {
+ if (node_exact)
+ return NULL;
+
+ for_each_online_node(node) {
+ cma = hugetlb_cma[node];
+ if (!cma || node == *nid)
+ continue;
+ m = cma_reserve_early(cma, huge_page_size(h));
+ if (m) {
+ *nid = node;
+ break;
+ }
+ }
+ }
+
+ if (m) {
+ m->flags = HUGE_BOOTMEM_CMA;
+ m->cma = cma;
+ }
+
+ return m;
+}
+
+
+static bool cma_reserve_called __initdata;
+
+static int __init cmdline_parse_hugetlb_cma(char *p)
+{
+ int nid, count = 0;
+ unsigned long tmp;
+ char *s = p;
+
+ while (*s) {
+ if (sscanf(s, "%lu%n", &tmp, &count) != 1)
+ break;
+
+ if (s[count] == ':') {
+ if (tmp >= MAX_NUMNODES)
+ break;
+ nid = array_index_nospec(tmp, MAX_NUMNODES);
+
+ s += count + 1;
+ tmp = memparse(s, &s);
+ hugetlb_cma_size_in_node[nid] = tmp;
+ hugetlb_cma_size += tmp;
+
+ /*
+ * Skip the separator if have one, otherwise
+ * break the parsing.
+ */
+ if (*s == ',')
+ s++;
+ else
+ break;
+ } else {
+ hugetlb_cma_size = memparse(p, &p);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
+
+static int __init cmdline_parse_hugetlb_cma_only(char *p)
+{
+ return kstrtobool(p, &hugetlb_cma_only);
+}
+
+early_param("hugetlb_cma_only", cmdline_parse_hugetlb_cma_only);
+
+void __init hugetlb_cma_reserve(int order)
+{
+ unsigned long size, reserved, per_node;
+ bool node_specific_cma_alloc = false;
+ int nid;
+
+ /*
+ * HugeTLB CMA reservation is required for gigantic
+ * huge pages which could not be allocated via the
+ * page allocator. Just warn if there is any change
+ * breaking this assumption.
+ */
+ VM_WARN_ON(order <= MAX_PAGE_ORDER);
+ cma_reserve_called = true;
+
+ if (!hugetlb_cma_size)
+ return;
+
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ if (hugetlb_cma_size_in_node[nid] == 0)
+ continue;
+
+ if (!node_online(nid)) {
+ pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
+ hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+ hugetlb_cma_size_in_node[nid] = 0;
+ continue;
+ }
+
+ if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
+ pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
+ nid, (PAGE_SIZE << order) / SZ_1M);
+ hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+ hugetlb_cma_size_in_node[nid] = 0;
+ } else {
+ node_specific_cma_alloc = true;
+ }
+ }
+
+ /* Validate the CMA size again in case some invalid nodes specified. */
+ if (!hugetlb_cma_size)
+ return;
+
+ if (hugetlb_cma_size < (PAGE_SIZE << order)) {
+ pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
+ (PAGE_SIZE << order) / SZ_1M);
+ hugetlb_cma_size = 0;
+ return;
+ }
+
+ if (!node_specific_cma_alloc) {
+ /*
+ * If 3 GB area is requested on a machine with 4 numa nodes,
+ * let's allocate 1 GB on first three nodes and ignore the last one.
+ */
+ per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
+ pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
+ hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+ }
+
+ reserved = 0;
+ for_each_online_node(nid) {
+ int res;
+ char name[CMA_MAX_NAME];
+
+ if (node_specific_cma_alloc) {
+ if (hugetlb_cma_size_in_node[nid] == 0)
+ continue;
+
+ size = hugetlb_cma_size_in_node[nid];
+ } else {
+ size = min(per_node, hugetlb_cma_size - reserved);
+ }
+
+ size = round_up(size, PAGE_SIZE << order);
+
+ snprintf(name, sizeof(name), "hugetlb%d", nid);
+ /*
+ * Note that 'order per bit' is based on smallest size that
+ * may be returned to CMA allocator in the case of
+ * huge page demotion.
+ */
+ res = cma_declare_contiguous_multi(size, PAGE_SIZE << order,
+ HUGETLB_PAGE_ORDER, name,
+ &hugetlb_cma[nid], nid);
+ if (res) {
+ pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
+ res, nid);
+ continue;
+ }
+
+ reserved += size;
+ pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
+ size / SZ_1M, nid);
+
+ if (reserved >= hugetlb_cma_size)
+ break;
+ }
+
+ if (!reserved)
+ /*
+ * hugetlb_cma_size is used to determine if allocations from
+ * cma are possible. Set to zero if no cma regions are set up.
+ */
+ hugetlb_cma_size = 0;
+}
+
+void __init hugetlb_cma_check(void)
+{
+ if (!hugetlb_cma_size || cma_reserve_called)
+ return;
+
+ pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
+}
+
+bool hugetlb_cma_exclusive_alloc(void)
+{
+ return hugetlb_cma_only;
+}
+
+unsigned long __init hugetlb_cma_total_size(void)
+{
+ return hugetlb_cma_size;
+}
+
+void __init hugetlb_cma_validate_params(void)
+{
+ if (!hugetlb_cma_size)
+ hugetlb_cma_only = false;
+}
+
+bool __init hugetlb_early_cma(struct hstate *h)
+{
+ if (arch_has_huge_bootmem_alloc())
+ return false;
+
+ return hstate_is_gigantic(h) && hugetlb_cma_only;
+}
diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h
new file mode 100644
index 000000000000..f7d7fb9880a2
--- /dev/null
+++ b/mm/hugetlb_cma.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_HUGETLB_CMA_H
+#define _LINUX_HUGETLB_CMA_H
+
+#ifdef CONFIG_CMA
+void hugetlb_cma_free_folio(struct folio *folio);
+struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask);
+struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid,
+ bool node_exact);
+void hugetlb_cma_check(void);
+bool hugetlb_cma_exclusive_alloc(void);
+unsigned long hugetlb_cma_total_size(void);
+void hugetlb_cma_validate_params(void);
+bool hugetlb_early_cma(struct hstate *h);
+#else
+static inline void hugetlb_cma_free_folio(struct folio *folio)
+{
+}
+
+static inline struct folio *hugetlb_cma_alloc_folio(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
+
+static inline
+struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid,
+ bool node_exact)
+{
+ return NULL;
+}
+
+static inline void hugetlb_cma_check(void)
+{
+}
+
+static inline bool hugetlb_cma_exclusive_alloc(void)
+{
+ return false;
+}
+
+static inline unsigned long hugetlb_cma_total_size(void)
+{
+ return 0;
+}
+
+static inline void hugetlb_cma_validate_params(void)
+{
+}
+
+static inline bool hugetlb_early_cma(struct hstate *h)
+{
+ return false;
+}
+#endif
+#endif
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 7735972add01..9a99dfa3c495 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -444,7 +444,11 @@ DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
-core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
+static int __init hugetlb_vmemmap_optimize_param(char *buf)
+{
+ return kstrtobool(buf, &vmemmap_optimize_enabled);
+}
+early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
struct folio *folio, unsigned long flags)
@@ -645,14 +649,39 @@ static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *fol
return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}
-void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
+static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
+ struct list_head *folio_list,
+ bool boot)
{
struct folio *folio;
+ int nr_to_optimize;
LIST_HEAD(vmemmap_pages);
unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
+ nr_to_optimize = 0;
list_for_each_entry(folio, folio_list, lru) {
- int ret = hugetlb_vmemmap_split_folio(h, folio);
+ int ret;
+ unsigned long spfn, epfn;
+
+ if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
+ /*
+ * Already optimized by pre-HVO, just map the
+ * mirrored tail page structs RO.
+ */
+ spfn = (unsigned long)&folio->page;
+ epfn = spfn + pages_per_huge_page(h);
+ vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
+ HUGETLB_VMEMMAP_RESERVE_SIZE);
+ register_page_bootmem_memmap(pfn_to_section_nr(spfn),
+ &folio->page,
+ HUGETLB_VMEMMAP_RESERVE_SIZE);
+ static_branch_inc(&hugetlb_optimize_vmemmap_key);
+ continue;
+ }
+
+ nr_to_optimize++;
+
+ ret = hugetlb_vmemmap_split_folio(h, folio);
/*
* Spliting the PMD requires allocating a page, thus lets fail
@@ -664,6 +693,16 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
break;
}
+ if (!nr_to_optimize)
+ /*
+ * All pre-HVO folios, nothing left to do. It's ok if
+ * there is a mix of pre-HVO and not yet HVO-ed folios
+ * here, as __hugetlb_vmemmap_optimize_folio() will
+ * skip any folios that already have the optimized flag
+ * set, see vmemmap_should_optimize_folio().
+ */
+ goto out;
+
flush_tlb_all();
list_for_each_entry(folio, folio_list, lru) {
@@ -689,10 +728,164 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
}
}
+out:
flush_tlb_all();
free_vmemmap_page_list(&vmemmap_pages);
}
+void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
+{
+ __hugetlb_vmemmap_optimize_folios(h, folio_list, false);
+}
+
+void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
+{
+ __hugetlb_vmemmap_optimize_folios(h, folio_list, true);
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+
+/* Return true of a bootmem allocated HugeTLB page should be pre-HVO-ed */
+static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
+{
+ unsigned long section_size, psize, pmd_vmemmap_size;
+ phys_addr_t paddr;
+
+ if (!READ_ONCE(vmemmap_optimize_enabled))
+ return false;
+
+ if (!hugetlb_vmemmap_optimizable(m->hstate))
+ return false;
+
+ psize = huge_page_size(m->hstate);
+ paddr = virt_to_phys(m);
+
+ /*
+ * Pre-HVO only works if the bootmem huge page
+ * is aligned to the section size.
+ */
+ section_size = (1UL << PA_SECTION_SHIFT);
+ if (!IS_ALIGNED(paddr, section_size) ||
+ !IS_ALIGNED(psize, section_size))
+ return false;
+
+ /*
+ * The pre-HVO code does not deal with splitting PMDS,
+ * so the bootmem page must be aligned to the number
+ * of base pages that can be mapped with one vmemmap PMD.
+ */
+ pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
+ if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
+ !IS_ALIGNED(psize, pmd_vmemmap_size))
+ return false;
+
+ return true;
+}
+
+/*
+ * Initialize memmap section for a gigantic page, HVO-style.
+ */
+void __init hugetlb_vmemmap_init_early(int nid)
+{
+ unsigned long psize, paddr, section_size;
+ unsigned long ns, i, pnum, pfn, nr_pages;
+ unsigned long start, end;
+ struct huge_bootmem_page *m = NULL;
+ void *map;
+
+ /*
+ * Noting to do if bootmem pages were not allocated
+ * early in boot, or if HVO wasn't enabled in the
+ * first place.
+ */
+ if (!hugetlb_bootmem_allocated())
+ return;
+
+ if (!READ_ONCE(vmemmap_optimize_enabled))
+ return;
+
+ section_size = (1UL << PA_SECTION_SHIFT);
+
+ list_for_each_entry(m, &huge_boot_pages[nid], list) {
+ if (!vmemmap_should_optimize_bootmem_page(m))
+ continue;
+
+ nr_pages = pages_per_huge_page(m->hstate);
+ psize = nr_pages << PAGE_SHIFT;
+ paddr = virt_to_phys(m);
+ pfn = PHYS_PFN(paddr);
+ map = pfn_to_page(pfn);
+ start = (unsigned long)map;
+ end = start + nr_pages * sizeof(struct page);
+
+ if (vmemmap_populate_hvo(start, end, nid,
+ HUGETLB_VMEMMAP_RESERVE_SIZE) < 0)
+ continue;
+
+ memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE);
+
+ pnum = pfn_to_section_nr(pfn);
+ ns = psize / section_size;
+
+ for (i = 0; i < ns; i++) {
+ sparse_init_early_section(nid, map, pnum,
+ SECTION_IS_VMEMMAP_PREINIT);
+ map += section_map_size();
+ pnum++;
+ }
+
+ m->flags |= HUGE_BOOTMEM_HVO;
+ }
+}
+
+void __init hugetlb_vmemmap_init_late(int nid)
+{
+ struct huge_bootmem_page *m, *tm;
+ unsigned long phys, nr_pages, start, end;
+ unsigned long pfn, nr_mmap;
+ struct hstate *h;
+ void *map;
+
+ if (!hugetlb_bootmem_allocated())
+ return;
+
+ if (!READ_ONCE(vmemmap_optimize_enabled))
+ return;
+
+ list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
+ if (!(m->flags & HUGE_BOOTMEM_HVO))
+ continue;
+
+ phys = virt_to_phys(m);
+ h = m->hstate;
+ pfn = PHYS_PFN(phys);
+ nr_pages = pages_per_huge_page(h);
+
+ if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
+ /*
+ * Oops, the hugetlb page spans multiple zones.
+ * Remove it from the list, and undo HVO.
+ */
+ list_del(&m->list);
+
+ map = pfn_to_page(pfn);
+
+ start = (unsigned long)map;
+ end = start + nr_pages * sizeof(struct page);
+
+ vmemmap_undo_hvo(start, end, nid,
+ HUGETLB_VMEMMAP_RESERVE_SIZE);
+ nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE;
+ memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));
+
+ memblock_phys_free(phys, huge_page_size(h));
+ continue;
+ } else
+ m->flags |= HUGE_BOOTMEM_ZONES_VALID;
+ }
+}
+#endif
+
static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
{
.procname = "hugetlb_optimize_vmemmap",
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 2fcae92d3359..18b490825215 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -9,6 +9,8 @@
#ifndef _LINUX_HUGETLB_VMEMMAP_H
#define _LINUX_HUGETLB_VMEMMAP_H
#include <linux/hugetlb.h>
+#include <linux/io.h>
+#include <linux/memblock.h>
/*
* Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
@@ -24,6 +26,12 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h,
struct list_head *non_hvo_folios);
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio);
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
+void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+void hugetlb_vmemmap_init_early(int nid);
+void hugetlb_vmemmap_init_late(int nid);
+#endif
+
static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
{
@@ -48,7 +56,7 @@ static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct f
return 0;
}
-static long hugetlb_vmemmap_restore_folios(const struct hstate *h,
+static inline long hugetlb_vmemmap_restore_folios(const struct hstate *h,
struct list_head *folio_list,
struct list_head *non_hvo_folios)
{
@@ -64,6 +72,19 @@ static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list
{
}
+static inline void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h,
+ struct list_head *folio_list)
+{
+}
+
+static inline void hugetlb_vmemmap_init_early(int nid)
+{
+}
+
+static inline void hugetlb_vmemmap_init_late(int nid)
+{
+}
+
static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
return 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 6af3ad675930..4600e7605cab 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -40,6 +40,7 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
#ifdef CONFIG_PER_VMA_LOCK
+ .vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait),
.mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
#endif
.user_ns = &init_user_ns,
diff --git a/mm/internal.h b/mm/internal.h
index 109ef30fee11..50c2f590b2d0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -25,6 +25,47 @@
struct folio_batch;
/*
+ * Maintains state across a page table move. The operation assumes both source
+ * and destination VMAs already exist and are specified by the user.
+ *
+ * Partial moves are permitted, but the old and new ranges must both reside
+ * within a VMA.
+ *
+ * mmap lock must be held in write and VMA write locks must be held on any VMA
+ * that is visible.
+ *
+ * Use the PAGETABLE_MOVE() macro to initialise this struct.
+ *
+ * The old_addr and new_addr fields are updated as the page table move is
+ * executed.
+ *
+ * NOTE: The page table move is affected by reading from [old_addr, old_end),
+ * and old_addr may be updated for better page table alignment, so len_in
+ * represents the length of the range being copied as specified by the user.
+ */
+struct pagetable_move_control {
+ struct vm_area_struct *old; /* Source VMA. */
+ struct vm_area_struct *new; /* Destination VMA. */
+ unsigned long old_addr; /* Address from which the move begins. */
+ unsigned long old_end; /* Exclusive address at which old range ends. */
+ unsigned long new_addr; /* Address to move page tables to. */
+ unsigned long len_in; /* Bytes to remap specified by user. */
+
+ bool need_rmap_locks; /* Do rmap locks need to be taken? */
+ bool for_stack; /* Is this an early temp stack being moved? */
+};
+
+#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \
+ struct pagetable_move_control name = { \
+ .old = old_, \
+ .new = new_, \
+ .old_addr = old_addr_, \
+ .old_end = (old_addr_) + (len_), \
+ .new_addr = new_addr_, \
+ .len_in = len_, \
+ }
+
+/*
* The set of flags that only affect watermark checking and reclaim
* behaviour. This is used by the MM to obey the caller constraints
* about IO, FS and watermark checking while ignoring placement
@@ -84,6 +125,8 @@ void page_writeback_init(void);
*/
static inline int folio_nr_pages_mapped(const struct folio *folio)
{
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
+ return -1;
return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
}
@@ -493,6 +536,7 @@ extern char * const zone_names[MAX_NR_ZONES];
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
extern int min_free_kbytes;
+extern int defrag_mode;
void setup_per_zone_wmarks(void);
void calculate_min_free_kbytes(void);
@@ -658,6 +702,8 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
}
void set_zone_contiguous(struct zone *zone);
+bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
+ unsigned long nr_pages);
static inline void clear_zone_contiguous(struct zone *zone)
{
@@ -682,8 +728,8 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
return;
folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
-#ifdef CONFIG_64BIT
- folio->_folio_nr_pages = 1U << order;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ folio->_nr_pages = 1U << order;
#endif
}
@@ -719,9 +765,17 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
folio_set_order(folio, order);
atomic_set(&folio->_large_mapcount, -1);
- atomic_set(&folio->_entire_mapcount, -1);
- atomic_set(&folio->_nr_pages_mapped, 0);
- atomic_set(&folio->_pincount, 0);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, 0);
+ if (IS_ENABLED(CONFIG_MM_ID)) {
+ folio->_mm_ids = 0;
+ folio->_mm_id_mapcount[0] = -1;
+ folio->_mm_id_mapcount[1] = -1;
+ }
+ if (IS_ENABLED(CONFIG_64BIT) || order > 1) {
+ atomic_set(&folio->_pincount, 0);
+ atomic_set(&folio->_entire_mapcount, -1);
+ }
if (order > 1)
INIT_LIST_HEAD(&folio->_deferred_list);
}
@@ -735,8 +789,6 @@ static inline void prep_compound_tail(struct page *head, int tail_idx)
set_page_private(p, 0);
}
-extern void prep_compound_page(struct page *page, unsigned int order);
-
void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags);
extern bool free_pages_prepare(struct page *page, unsigned int order);
@@ -846,8 +898,24 @@ void init_cma_reserved_pageblock(struct page *page);
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
+struct cma;
+
+#ifdef CONFIG_CMA
+void *cma_reserve_early(struct cma *cma, unsigned long size);
+void init_cma_pageblock(struct page *page);
+#else
+static inline void *cma_reserve_early(struct cma *cma, unsigned long size)
+{
+ return NULL;
+}
+static inline void init_cma_pageblock(struct page *page)
+{
+}
+#endif
+
+
int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_stealable, bool *can_steal);
+ int migratetype, bool claim_only, bool *claim_block);
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
@@ -1097,9 +1165,13 @@ static inline void mminit_verify_zonelist(void)
#define NODE_RECLAIM_SUCCESS 1
#ifdef CONFIG_NUMA
+extern int node_reclaim_mode;
+
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
+#define node_reclaim_mode 0
+
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
unsigned int order)
{
@@ -1111,11 +1183,17 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
}
#endif
+static inline bool node_reclaim_enabled(void)
+{
+ /* Is any node_reclaim_mode bit set? */
+ return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
+}
+
/*
* mm/memory-failure.c
*/
#ifdef CONFIG_MEMORY_FAILURE
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu);
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);
@@ -1138,8 +1216,9 @@ unsigned long page_mapped_in_vma(const struct page *page,
struct vm_area_struct *vma);
#else
-static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
+ return -EBUSY;
}
#endif
@@ -1187,6 +1266,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_NOFRAGMENT 0x0
#endif
#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */
+#define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */
#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
/* Flags that allow allocations below the min watermark. */
@@ -1407,7 +1487,8 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
}
extern bool mirrored_kernelcore;
-extern bool memblock_has_mirror(void);
+bool memblock_has_mirror(void);
+void memblock_free_all(void);
static __always_inline void vma_set_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
@@ -1448,6 +1529,7 @@ static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte
void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid);
+void __meminit __init_page_from_nid(unsigned long pfn, int nid);
/* shrinker related functions */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
@@ -1509,10 +1591,7 @@ extern struct list_lru shadow_nodes;
} while (0)
/* mremap.c */
-unsigned long move_page_tables(struct vm_area_struct *vma,
- unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len,
- bool need_rmap_locks, bool for_stack);
+unsigned long move_page_tables(struct pagetable_move_control *pmc);
#ifdef CONFIG_UNACCEPTED_MEMORY
void accept_page(struct page *page);
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 3e049dfb28bd..c36dd9f62fd5 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -50,9 +50,9 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size,
#ifndef ioremap_prot
void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
- unsigned long prot)
+ pgprot_t prot)
{
- return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
+ return generic_ioremap_prot(phys_addr, size, prot);
}
EXPORT_SYMBOL(ioremap_prot);
#endif
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index 59d673400085..3ea317837c2d 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -1073,14 +1073,11 @@ static void kmem_cache_rcu_uaf(struct kunit *test)
kmem_cache_destroy(cache);
}
-static void empty_cache_ctor(void *object) { }
-
static void kmem_cache_double_destroy(struct kunit *test)
{
struct kmem_cache *cache;
- /* Provide a constructor to prevent cache merging. */
- cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor);
+ cache = kmem_cache_create("test_cache", 200, 0, SLAB_NO_MERGE, NULL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
kmem_cache_destroy(cache);
KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache));
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 3fe77a360f1c..8357e1a33699 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -370,6 +370,36 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
+/*
+ * This function is invoked with report_lock (a raw_spinlock) held. A
+ * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping
+ * rt_spinlock.
+ *
+ * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a
+ * lockdep warning for this raw_spinlock -> spinlock dependency. This config
+ * option is enabled by default to ensure better test coverage to expose this
+ * kind of RT kernel problem. This lockdep splat, however, can be suppressed
+ * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the
+ * invalid PREEMPT_RT case has been taken care of.
+ */
+static inline struct vm_struct *kasan_find_vm_area(void *addr)
+{
+ static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP);
+ struct vm_struct *va;
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return NULL;
+
+ /*
+ * Suppress lockdep warning and fetch vmalloc area of the
+ * offending address.
+ */
+ lock_map_acquire_try(&vmalloc_map);
+ va = find_vm_area(addr);
+ lock_map_release(&vmalloc_map);
+ return va;
+}
+
static void print_address_description(void *addr, u8 tag,
struct kasan_report_info *info)
{
@@ -399,7 +429,7 @@ static void print_address_description(void *addr, u8 tag,
}
if (is_vmalloc_addr(addr)) {
- struct vm_struct *va = find_vm_area(addr);
+ struct vm_struct *va = kasan_find_vm_area(addr);
if (va) {
pr_err("The buggy address belongs to the virtual mapping at\n"
@@ -409,6 +439,8 @@ static void print_address_description(void *addr, u8 tag,
pr_err("\n");
page = vmalloc_to_page(addr);
+ } else {
+ pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr);
}
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 5f0be134141e..cc945c6ab3bd 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -607,7 +607,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
/* See hpage_collapse_scan_pmd(). */
- if (folio_likely_mapped_shared(folio)) {
+ if (folio_maybe_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
@@ -1359,11 +1359,9 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
/*
* We treat a single page as shared if any part of the THP
- * is shared. "False negatives" from
- * folio_likely_mapped_shared() are not expected to matter
- * much in practice.
+ * is shared.
*/
- if (folio_likely_mapped_shared(folio)) {
+ if (folio_maybe_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c6ed68604136..c12cef3eeb32 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -352,6 +352,15 @@ static bool unreferenced_object(struct kmemleak_object *object)
jiffies_last_scan);
}
+static const char *__object_type_str(struct kmemleak_object *object)
+{
+ if (object->flags & OBJECT_PHYS)
+ return " (phys)";
+ if (object->flags & OBJECT_PERCPU)
+ return " (percpu)";
+ return "";
+}
+
/*
* Printing of the unreferenced objects information to the seq file. The
* print_unreferenced function must be called with the object->lock held.
@@ -364,8 +373,9 @@ static void print_unreferenced(struct seq_file *seq,
unsigned int nr_entries;
nr_entries = stack_depot_fetch(object->trace_handle, &entries);
- warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
- object->pointer, object->size);
+ warn_or_seq_printf(seq, "unreferenced object%s 0x%08lx (size %zu):\n",
+ __object_type_str(object),
+ object->pointer, object->size);
warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
object->comm, object->pid, object->jiffies);
hex_dump_object(seq, object);
@@ -384,10 +394,10 @@ static void print_unreferenced(struct seq_file *seq,
*/
static void dump_object_info(struct kmemleak_object *object)
{
- pr_notice("Object 0x%08lx (size %zu):\n",
- object->pointer, object->size);
+ pr_notice("Object%s 0x%08lx (size %zu):\n",
+ __object_type_str(object), object->pointer, object->size);
pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
- object->comm, object->pid, object->jiffies);
+ object->comm, object->pid, object->jiffies);
pr_notice(" min_count = %d\n", object->min_count);
pr_notice(" count = %d\n", object->count);
pr_notice(" flags = 0x%x\n", object->flags);
@@ -1998,25 +2008,41 @@ static int kmemleak_open(struct inode *inode, struct file *file)
return seq_open(file, &kmemleak_seq_ops);
}
-static int dump_str_object_info(const char *str)
+static bool __dump_str_object_info(unsigned long addr, unsigned int objflags)
{
unsigned long flags;
struct kmemleak_object *object;
+
+ object = __find_and_get_object(addr, 1, objflags);
+ if (!object)
+ return false;
+
+ raw_spin_lock_irqsave(&object->lock, flags);
+ dump_object_info(object);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
+
+ put_object(object);
+
+ return true;
+}
+
+static int dump_str_object_info(const char *str)
+{
unsigned long addr;
+ bool found = false;
if (kstrtoul(str, 0, &addr))
return -EINVAL;
- object = find_and_get_object(addr, 0);
- if (!object) {
+
+ found |= __dump_str_object_info(addr, 0);
+ found |= __dump_str_object_info(addr, OBJECT_PHYS);
+ found |= __dump_str_object_info(addr, OBJECT_PERCPU);
+
+ if (!found) {
pr_info("Unknown object at 0x%08lx\n", addr);
return -EINVAL;
}
- raw_spin_lock_irqsave(&object->lock, flags);
- dump_object_info(object);
- raw_spin_unlock_irqrestore(&object->lock, flags);
-
- put_object(object);
return 0;
}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 3ea50f09311f..3df45c25c1f6 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -357,6 +357,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
size -= to_go;
}
}
+EXPORT_SYMBOL_GPL(kmsan_handle_dma);
void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
enum dma_data_direction dir)
diff --git a/mm/ksm.c b/mm/ksm.c
index 8be2b144fefd..8583fb91ef13 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1270,8 +1270,15 @@ static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
goto out_unlock;
- anon_exclusive = PageAnonExclusive(&folio->page);
entry = ptep_get(pvmw.pte);
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that actually
+ * map pages: give up just like the next folio_walk would.
+ */
+ if (unlikely(!pte_present(entry)))
+ goto out_unlock;
+
+ anon_exclusive = PageAnonExclusive(&folio->page);
if (pte_write(entry) || pte_dirty(entry) ||
anon_exclusive || mm_tlb_flush_pending(mm)) {
swapped = folio_test_swapcache(folio);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 7d69434c70e0..490473af3122 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -510,7 +510,7 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
gfp_t gfp)
{
unsigned long flags;
- struct list_lru_memcg *mlru;
+ struct list_lru_memcg *mlru = NULL;
struct mem_cgroup *pos, *parent;
XA_STATE(xas, &lru->xa, 0);
@@ -535,9 +535,11 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
parent = parent_mem_cgroup(pos);
}
- mlru = memcg_init_list_lru_one(lru, gfp);
- if (!mlru)
- return -ENOMEM;
+ if (!mlru) {
+ mlru = memcg_init_list_lru_one(lru, gfp);
+ if (!mlru)
+ return -ENOMEM;
+ }
xas_set(&xas, pos->kmemcg_id);
do {
xas_lock_irqsave(&xas, flags);
@@ -548,10 +550,11 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
}
xas_unlock_irqrestore(&xas, flags);
} while (xas_nomem(&xas, gfp));
- if (mlru)
- kfree(mlru);
} while (pos != memcg && !css_is_dying(&pos->css));
+ if (unlikely(mlru))
+ kfree(mlru);
+
return xas_error(&xas);
}
#else
diff --git a/mm/madvise.c b/mm/madvise.c
index 49f3a75046f6..b17f684322ad 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -387,7 +387,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
folio = pmd_folio(orig_pmd);
/* Do not interfere with other mappings of this folio */
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
goto huge_unlock;
if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -486,7 +486,7 @@ restart:
if (nr < folio_nr_pages(folio)) {
int err;
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
continue;
if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
@@ -721,7 +721,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (nr < folio_nr_pages(folio)) {
int err;
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
continue;
if (!folio_trylock(folio))
continue;
@@ -933,7 +933,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
end = vma->vm_end;
}
- VM_WARN_ON(start >= end);
+ /*
+ * If the memory region between start and end was
+ * originally backed by 4kB pages and then remapped to
+ * be backed by hugepages while mmap_lock was dropped,
+ * the adjustment for hugetlb vma above may have rounded
+ * end down to the start address.
+ */
+ if (start == end)
+ return 0;
+ VM_WARN_ON(start > end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
@@ -1042,13 +1051,7 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
if (!allow_locked)
disallowed |= VM_LOCKED;
- if (!vma_is_anonymous(vma))
- return false;
-
- if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE)
- return false;
-
- return true;
+ return !(vma->vm_flags & disallowed);
}
static bool is_guard_pte_marker(pte_t ptent)
@@ -1389,7 +1392,32 @@ static int madvise_inject_error(int behavior,
return 0;
}
-#endif
+
+static bool is_memory_failure(int behavior)
+{
+ switch (behavior) {
+ case MADV_HWPOISON:
+ case MADV_SOFT_OFFLINE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#else
+
+static int madvise_inject_error(int behavior,
+ unsigned long start, unsigned long end)
+{
+ return 0;
+}
+
+static bool is_memory_failure(int behavior)
+{
+ return false;
+}
+
+#endif /* CONFIG_MEMORY_FAILURE */
static bool
madvise_behavior_valid(int behavior)
@@ -1565,6 +1593,111 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
+
+static int madvise_lock(struct mm_struct *mm, int behavior)
+{
+ if (is_memory_failure(behavior))
+ return 0;
+
+ if (madvise_need_mmap_write(behavior)) {
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ } else {
+ mmap_read_lock(mm);
+ }
+ return 0;
+}
+
+static void madvise_unlock(struct mm_struct *mm, int behavior)
+{
+ if (is_memory_failure(behavior))
+ return;
+
+ if (madvise_need_mmap_write(behavior))
+ mmap_write_unlock(mm);
+ else
+ mmap_read_unlock(mm);
+}
+
+static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
+{
+ size_t len;
+
+ if (!madvise_behavior_valid(behavior))
+ return false;
+
+ if (!PAGE_ALIGNED(start))
+ return false;
+ len = PAGE_ALIGN(len_in);
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return false;
+
+ if (start + len < start)
+ return false;
+
+ return true;
+}
+
+/*
+ * madvise_should_skip() - Return if the request is invalid or nothing.
+ * @start: Start address of madvise-requested address range.
+ * @len_in: Length of madvise-requested address range.
+ * @behavior: Requested madvise behavor.
+ * @err: Pointer to store an error code from the check.
+ *
+ * If the specified behaviour is invalid or nothing would occur, we skip the
+ * operation. This function returns true in the cases, otherwise false. In
+ * the former case we store an error on @err.
+ */
+static bool madvise_should_skip(unsigned long start, size_t len_in,
+ int behavior, int *err)
+{
+ if (!is_valid_madvise(start, len_in, behavior)) {
+ *err = -EINVAL;
+ return true;
+ }
+ if (start + PAGE_ALIGN(len_in) == start) {
+ *err = 0;
+ return true;
+ }
+ return false;
+}
+
+static bool is_madvise_populate(int behavior)
+{
+ switch (behavior) {
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int madvise_do_behavior(struct mm_struct *mm,
+ unsigned long start, size_t len_in, int behavior)
+{
+ struct blk_plug plug;
+ unsigned long end;
+ int error;
+
+ if (is_memory_failure(behavior))
+ return madvise_inject_error(behavior, start, start + len_in);
+ start = untagged_addr_remote(mm, start);
+ end = start + PAGE_ALIGN(len_in);
+
+ blk_start_plug(&plug);
+ if (is_madvise_populate(behavior))
+ error = madvise_populate(mm, start, end, behavior);
+ else
+ error = madvise_walk_vmas(mm, start, end, behavior,
+ madvise_vma_behavior);
+ blk_finish_plug(&plug);
+ return error;
+}
+
/*
* The madvise(2) system call.
*
@@ -1639,63 +1772,15 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
- unsigned long end;
int error;
- int write;
- size_t len;
- struct blk_plug plug;
-
- if (!madvise_behavior_valid(behavior))
- return -EINVAL;
-
- if (!PAGE_ALIGNED(start))
- return -EINVAL;
- len = PAGE_ALIGN(len_in);
-
- /* Check to see whether len was rounded up from small -ve to zero */
- if (len_in && !len)
- return -EINVAL;
- end = start + len;
- if (end < start)
- return -EINVAL;
-
- if (end == start)
- return 0;
-
-#ifdef CONFIG_MEMORY_FAILURE
- if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
- return madvise_inject_error(behavior, start, start + len_in);
-#endif
-
- write = madvise_need_mmap_write(behavior);
- if (write) {
- if (mmap_write_lock_killable(mm))
- return -EINTR;
- } else {
- mmap_read_lock(mm);
- }
-
- start = untagged_addr_remote(mm, start);
- end = start + len;
-
- blk_start_plug(&plug);
- switch (behavior) {
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- error = madvise_populate(mm, start, end, behavior);
- break;
- default:
- error = madvise_walk_vmas(mm, start, end, behavior,
- madvise_vma_behavior);
- break;
- }
- blk_finish_plug(&plug);
-
- if (write)
- mmap_write_unlock(mm);
- else
- mmap_read_unlock(mm);
+ if (madvise_should_skip(start, len_in, behavior, &error))
+ return error;
+ error = madvise_lock(mm, behavior);
+ if (error)
+ return error;
+ error = madvise_do_behavior(mm, start, len_in, behavior);
+ madvise_unlock(mm, behavior);
return error;
}
@@ -1714,16 +1799,26 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
total_len = iov_iter_count(iter);
+ ret = madvise_lock(mm, behavior);
+ if (ret)
+ return ret;
+
while (iov_iter_count(iter)) {
- ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
- iter_iov_len(iter), behavior);
+ unsigned long start = (unsigned long)iter_iov_addr(iter);
+ size_t len_in = iter_iov_len(iter);
+ int error;
+
+ if (madvise_should_skip(start, len_in, behavior, &error))
+ ret = error;
+ else
+ ret = madvise_do_behavior(mm, start, len_in, behavior);
/*
* An madvise operation is attempting to restart the syscall,
* but we cannot proceed as it would not be correct to repeat
* the operation in aggregate, and would be surprising to the
* user.
*
- * As we have already dropped locks, it is safe to just loop and
+ * We drop and reacquire locks so it is safe to just loop and
* try again. We check for fatal signals in case we need exit
* early anyway.
*/
@@ -1732,12 +1827,17 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
ret = -EINTR;
break;
}
+
+ /* Drop and reacquire lock to unwind race. */
+ madvise_unlock(mm, behavior);
+ madvise_lock(mm, behavior);
continue;
}
if (ret < 0)
break;
iov_iter_advance(iter, iter_iov_len(iter));
}
+ madvise_unlock(mm, behavior);
ret = (total_len - iov_iter_count(iter)) ? : ret;
diff --git a/mm/memblock.c b/mm/memblock.c
index 95af35fd1389..0a53db4d9f7b 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -16,6 +16,7 @@
#include <linux/kmemleak.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
+#include <linux/mutex.h>
#include <asm/sections.h>
#include <linux/io.h>
@@ -2164,8 +2165,10 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
phys_addr_t end)
{
unsigned long start_pfn = PFN_UP(start);
- unsigned long end_pfn = min_t(unsigned long,
- PFN_DOWN(end), max_low_pfn);
+ unsigned long end_pfn = PFN_DOWN(end);
+
+ if (!IS_ENABLED(CONFIG_HIGHMEM) && end_pfn > max_low_pfn)
+ end_pfn = max_low_pfn;
if (start_pfn >= end_pfn)
return 0;
@@ -2283,6 +2286,7 @@ struct reserve_mem_table {
};
static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES];
static int reserved_mem_count;
+static DEFINE_MUTEX(reserve_mem_lock);
/* Add wildcard region with a lookup name */
static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
@@ -2296,6 +2300,21 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
strscpy(map->name, name);
}
+static struct reserve_mem_table *reserve_mem_find_by_name_nolock(const char *name)
+{
+ struct reserve_mem_table *map;
+ int i;
+
+ for (i = 0; i < reserved_mem_count; i++) {
+ map = &reserved_mem_table[i];
+ if (!map->size)
+ continue;
+ if (strcmp(name, map->name) == 0)
+ return map;
+ }
+ return NULL;
+}
+
/**
* reserve_mem_find_by_name - Find reserved memory region with a given name
* @name: The name that is attached to a reserved memory region
@@ -2309,22 +2328,47 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size)
{
struct reserve_mem_table *map;
- int i;
- for (i = 0; i < reserved_mem_count; i++) {
- map = &reserved_mem_table[i];
- if (!map->size)
- continue;
- if (strcmp(name, map->name) == 0) {
- *start = map->start;
- *size = map->size;
- return 1;
- }
- }
- return 0;
+ guard(mutex)(&reserve_mem_lock);
+ map = reserve_mem_find_by_name_nolock(name);
+ if (!map)
+ return 0;
+
+ *start = map->start;
+ *size = map->size;
+ return 1;
}
EXPORT_SYMBOL_GPL(reserve_mem_find_by_name);
+/**
+ * reserve_mem_release_by_name - Release reserved memory region with a given name
+ * @name: The name that is attatched to a reserved memory region
+ *
+ * Forcibly release the pages in the reserved memory region so that those memory
+ * can be used as free memory. After released the reserved region size becomes 0.
+ *
+ * Returns: 1 if released or 0 if not found.
+ */
+int reserve_mem_release_by_name(const char *name)
+{
+ char buf[RESERVE_MEM_NAME_SIZE + 12];
+ struct reserve_mem_table *map;
+ void *start, *end;
+
+ guard(mutex)(&reserve_mem_lock);
+ map = reserve_mem_find_by_name_nolock(name);
+ if (!map)
+ return 0;
+
+ start = phys_to_virt(map->start);
+ end = start + map->size - 1;
+ snprintf(buf, sizeof(buf), "reserve_mem:%s", name);
+ free_reserved_area(start, end, 0, buf);
+ map->size = 0;
+
+ return 1;
+}
+
/*
* Parse reserve_mem=nn:align:name
*/
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 2be6b9112808..8660908850dc 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -490,6 +490,19 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg)
}
/* Cgroup1: threshold notifications & softlimit tree updates */
+
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremented by the number of pages. This counter is used
+ * to trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+ MEM_CGROUP_TARGET_THRESH,
+ MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_NTARGETS,
+};
+
struct memcg1_events_percpu {
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];
@@ -568,8 +581,59 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
local_irq_restore(flags);
}
-void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
+/**
+ * memcg1_swapout - transfer a memsw charge to swap
+ * @folio: folio whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @folio to @entry.
+ */
+void memcg1_swapout(struct folio *folio, swp_entry_t entry)
{
+ struct mem_cgroup *memcg, *swap_memcg;
+ unsigned int nr_entries;
+
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ if (!do_memsw_account())
+ return;
+
+ memcg = folio_memcg(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
+ if (!memcg)
+ return;
+
+ /*
+ * In case the memcg owning these pages has been offlined and doesn't
+ * have an ID allocated to it anymore, charge the closest online
+ * ancestor for the swap instead and transfer the memory+swap charge.
+ */
+ swap_memcg = mem_cgroup_id_get_online(memcg);
+ nr_entries = folio_nr_pages(folio);
+ /* Get references for the tail pages, too */
+ if (nr_entries > 1)
+ mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+ mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
+
+ swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+
+ folio_unqueue_deferred_split(folio);
+ folio->memcg_data = 0;
+
+ if (!mem_cgroup_is_root(memcg))
+ page_counter_uncharge(&memcg->memory, nr_entries);
+
+ if (memcg != swap_memcg) {
+ if (!mem_cgroup_is_root(swap_memcg))
+ page_counter_charge(&swap_memcg->memsw, nr_entries);
+ page_counter_uncharge(&memcg->memsw, nr_entries);
+ }
+
/*
* Interrupts should be disabled here because the caller holds the
* i_pages lock which is taken with interrupts-off. It is
@@ -581,6 +645,42 @@ void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
preempt_enable_nested();
memcg1_check_events(memcg, folio_nid(folio));
+
+ css_put(&memcg->css);
+}
+
+/*
+ * memcg1_swapin - uncharge swap slot
+ * @entry: the first swap entry for which the pages are charged
+ * @nr_pages: number of pages which will be uncharged
+ *
+ * Call this function after successfully adding the charged page to swapcache.
+ *
+ * Note: This function assumes the page for which swap slot is being uncharged
+ * is order 0 page.
+ */
+void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages)
+{
+ /*
+ * Cgroup1's unified memory+swap counter has been charged with the
+ * new swapcache page, finish the transfer by uncharging the swap
+ * slot. The swap slot would also get uncharged when it dies, but
+ * it can stick around indefinitely and we'd count the page twice
+ * the entire time.
+ *
+ * Cgroup2 has separate resource counters for memory and swap,
+ * so this is a non-issue here. Memory and swap charge lifetimes
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
+ * page to memory here, and uncharge swap when the slot is freed.
+ */
+ if (do_memsw_account()) {
+ /*
+ * The swap entry might not get freed for a long time,
+ * let's not wait for it. The page already received a
+ * memory+swap charge, drop the swap entry duplicate.
+ */
+ mem_cgroup_uncharge_swap(entry, nr_pages);
+ }
}
void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
@@ -1855,9 +1955,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
if (val > MAX_SWAPPINESS)
return -EINVAL;
- if (!mem_cgroup_is_root(memcg))
+ if (!mem_cgroup_is_root(memcg)) {
+ pr_info_once("Per memcg swappiness does not exist in cgroup v2. "
+ "See memory.reclaim or memory.swap.max there\n ");
WRITE_ONCE(memcg->swappiness, val);
- else
+ } else
WRITE_ONCE(vm_swappiness, val);
return 0;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 144d71b65907..6358464bb416 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -7,21 +7,6 @@
/* Cgroup v1 and v2 common declarations */
-int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages);
-
-static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages)
-{
- if (mem_cgroup_is_root(memcg))
- return 0;
-
- return try_charge_memcg(memcg, gfp_mask, nr_pages);
-}
-
-void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n);
-void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n);
-
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
@@ -37,38 +22,29 @@ void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n);
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-/* Whether legacy memory+swap accounting is active */
-static inline bool do_memsw_account(void)
-{
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
-}
-
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremented by the number of pages. This counter is used
- * to trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
- MEM_CGROUP_TARGET_THRESH,
- MEM_CGROUP_TARGET_SOFTLIMIT,
- MEM_CGROUP_NTARGETS,
-};
-
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
void drain_all_stock(struct mem_cgroup *root_memcg);
unsigned long memcg_events(struct mem_cgroup *memcg, int event);
-unsigned long memcg_events_local(struct mem_cgroup *memcg, int event);
-unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx);
unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
-unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item);
int memory_stat_show(struct seq_file *m, void *v);
+void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n);
+struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg);
+
/* Cgroup v1-specific declarations */
#ifdef CONFIG_MEMCG_V1
+/* Whether legacy memory+swap accounting is active */
+static inline bool do_memsw_account(void)
+{
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
+}
+
+unsigned long memcg_events_local(struct mem_cgroup *memcg, int event);
+unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx);
+unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item);
bool memcg1_alloc_events(struct mem_cgroup *memcg);
void memcg1_free_events(struct mem_cgroup *memcg);
@@ -96,7 +72,6 @@ void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked);
void memcg1_oom_recover(struct mem_cgroup *memcg);
void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
-void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg);
void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_memory, int nid);
@@ -119,6 +94,7 @@ extern struct cftype mem_cgroup_legacy_files[];
#else /* CONFIG_MEMCG_V1 */
+static inline bool do_memsw_account(void) { return false; }
static inline bool memcg1_alloc_events(struct mem_cgroup *memcg) { return true; }
static inline void memcg1_free_events(struct mem_cgroup *memcg) {}
@@ -134,8 +110,6 @@ static inline void memcg1_oom_recover(struct mem_cgroup *memcg) {}
static inline void memcg1_commit_charge(struct folio *folio,
struct mem_cgroup *memcg) {}
-static inline void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg) {}
-
static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg,
unsigned long pgpgout,
unsigned long nr_memory, int nid) {}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 46f8b372d212..421740f1bcdc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -315,6 +315,7 @@ static const unsigned int memcg_node_stat_items[] = {
PGDEMOTE_KSWAPD,
PGDEMOTE_DIRECT,
PGDEMOTE_KHUGEPAGED,
+ PGDEMOTE_PROACTIVE,
#ifdef CONFIG_HUGETLB_PAGE
NR_HUGETLB,
#endif
@@ -431,9 +432,11 @@ static const unsigned int memcg_vm_event_stat[] = {
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
+ PGSCAN_PROACTIVE,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
PGSTEAL_KHUGEPAGED,
+ PGSTEAL_PROACTIVE,
PGFAULT,
PGMAJFAULT,
PGREFILL,
@@ -706,6 +709,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
trace_mod_memcg_state(memcg, idx, val);
}
+#ifdef CONFIG_MEMCG_V1
/* idx can be of type enum memcg_stat_item or node_stat_item. */
unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
@@ -722,6 +726,7 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
#endif
return x;
}
+#endif
static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx,
@@ -869,6 +874,7 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event)
return READ_ONCE(memcg->vmstats->events[i]);
}
+#ifdef CONFIG_MEMCG_V1
unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
int i = memcg_events_index(event);
@@ -878,6 +884,7 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
return READ_ONCE(memcg->vmstats->events_local[i]);
}
+#endif
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
@@ -1390,6 +1397,7 @@ static const struct memory_stat memory_stats[] = {
{ "pgdemote_kswapd", PGDEMOTE_KSWAPD },
{ "pgdemote_direct", PGDEMOTE_DIRECT },
{ "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED },
+ { "pgdemote_proactive", PGDEMOTE_PROACTIVE },
#ifdef CONFIG_NUMA_BALANCING
{ "pgpromote_success", PGPROMOTE_SUCCESS },
#endif
@@ -1432,6 +1440,7 @@ static int memcg_page_state_output_unit(int item)
case PGDEMOTE_KSWAPD:
case PGDEMOTE_DIRECT:
case PGDEMOTE_KHUGEPAGED:
+ case PGDEMOTE_PROACTIVE:
#ifdef CONFIG_NUMA_BALANCING
case PGPROMOTE_SUCCESS:
#endif
@@ -1447,11 +1456,13 @@ unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item)
memcg_page_state_output_unit(item);
}
+#ifdef CONFIG_MEMCG_V1
unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item)
{
return memcg_page_state_local(memcg, item) *
memcg_page_state_output_unit(item);
}
+#endif
#ifdef CONFIG_HUGETLB_PAGE
static bool memcg_accounts_hugetlb(void)
@@ -1503,10 +1514,12 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
seq_buf_printf(s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT) +
+ memcg_events(memcg, PGSCAN_PROACTIVE) +
memcg_events(memcg, PGSCAN_KHUGEPAGED));
seq_buf_printf(s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT) +
+ memcg_events(memcg, PGSTEAL_PROACTIVE) +
memcg_events(memcg, PGSTEAL_KHUGEPAGED));
for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
@@ -1566,16 +1579,23 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
/* Use static buffer, for the caller is holding oom_lock. */
static char buf[SEQ_BUF_SIZE];
struct seq_buf s;
+ unsigned long memory_failcnt;
lockdep_assert_held(&oom_lock);
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ memory_failcnt = atomic_long_read(&memcg->memory_events[MEMCG_MAX]);
+ else
+ memory_failcnt = memcg->memory.failcnt;
+
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
- K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
+ K((u64)READ_ONCE(memcg->memory.max)), memory_failcnt);
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->swap)),
- K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
+ K((u64)READ_ONCE(memcg->swap.max)),
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
#ifdef CONFIG_MEMCG_V1
else {
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
@@ -1739,7 +1759,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
}
struct memcg_stock_pcp {
- local_lock_t stock_lock;
+ localtry_lock_t stock_lock;
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
@@ -1754,7 +1774,7 @@ struct memcg_stock_pcp {
#define FLUSHING_CACHED_CHARGE 0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
- .stock_lock = INIT_LOCAL_LOCK(stock_lock),
+ .stock_lock = INIT_LOCALTRY_LOCK(stock_lock),
};
static DEFINE_MUTEX(percpu_charge_mutex);
@@ -1766,6 +1786,7 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
+ * @gfp_mask: allocation mask.
*
* The charges will only happen if @memcg matches the current cpu's memcg
* stock, and at least @nr_pages are available in that stock. Failure to
@@ -1773,7 +1794,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
*
* returns true if successful, false otherwise.
*/
-static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
+ gfp_t gfp_mask)
{
struct memcg_stock_pcp *stock;
unsigned int stock_pages;
@@ -1783,7 +1805,11 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+ if (!gfpflags_allow_spinning(gfp_mask))
+ return ret;
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ }
stock = this_cpu_ptr(&memcg_stock);
stock_pages = READ_ONCE(stock->nr_pages);
@@ -1792,7 +1818,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
ret = true;
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -1831,14 +1857,14 @@ static void drain_local_stock(struct work_struct *dummy)
* drain_stock races is that we always operate on local CPU stock
* here with IRQ disabled
*/
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
old = drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
}
@@ -1868,9 +1894,20 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
unsigned long flags;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+ /*
+ * In case of unlikely failure to lock percpu stock_lock
+ * uncharge memcg directly.
+ */
+ if (mem_cgroup_is_root(memcg))
+ return;
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+ return;
+ }
__refill_stock(memcg, nr_pages);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}
/*
@@ -1921,9 +1958,18 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old;
+ unsigned long flags;
stock = &per_cpu(memcg_stock, cpu);
+
+ /* drain_obj_stock requires stock_lock */
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ old = drain_obj_stock(stock);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+
drain_stock(stock);
+ obj_cgroup_put(old);
return 0;
}
@@ -2198,8 +2244,8 @@ out:
css_put(&memcg->css);
}
-int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages)
+static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MAX_RECLAIM_RETRIES;
@@ -2213,9 +2259,13 @@ int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long pflags;
retry:
- if (consume_stock(memcg, nr_pages))
+ if (consume_stock(memcg, nr_pages, gfp_mask))
return 0;
+ if (!gfpflags_allow_spinning(gfp_mask))
+ /* Avoid the refill and flush of the older stock */
+ batch = nr_pages;
+
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
@@ -2388,6 +2438,15 @@ done_restock:
return 0;
}
+static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages)
+{
+ if (mem_cgroup_is_root(memcg))
+ return 0;
+
+ return try_charge_memcg(memcg, gfp_mask, nr_pages);
+}
+
static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio);
@@ -2612,7 +2671,8 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
- refill_stock(memcg, nr_pages);
+ if (!mem_cgroup_is_root(memcg))
+ refill_stock(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -2645,6 +2705,23 @@ out:
return ret;
}
+static struct obj_cgroup *page_objcg(const struct page *page)
+{
+ unsigned long memcg_data = page->memcg_data;
+
+ if (mem_cgroup_disabled() || !memcg_data)
+ return NULL;
+
+ VM_BUG_ON_PAGE((memcg_data & OBJEXTS_FLAGS_MASK) != MEMCG_DATA_KMEM,
+ page);
+ return (struct obj_cgroup *)(memcg_data - MEMCG_DATA_KMEM);
+}
+
+static void page_set_objcg(struct page *page, const struct obj_cgroup *objcg)
+{
+ page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM;
+}
+
/**
* __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
* @page: page to charge
@@ -2663,8 +2740,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
if (!ret) {
obj_cgroup_get(objcg);
- page->memcg_data = (unsigned long)objcg |
- MEMCG_DATA_KMEM;
+ page_set_objcg(page, objcg);
return 0;
}
}
@@ -2678,19 +2754,31 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct folio *folio = page_folio(page);
- struct obj_cgroup *objcg;
+ struct obj_cgroup *objcg = page_objcg(page);
unsigned int nr_pages = 1 << order;
- if (!folio_memcg_kmem(folio))
+ if (!objcg)
return;
- objcg = __folio_objcg(folio);
obj_cgroup_uncharge_pages(objcg, nr_pages);
- folio->memcg_data = 0;
+ page->memcg_data = 0;
obj_cgroup_put(objcg);
}
+/* Replace the stock objcg with objcg, return the old objcg */
+static struct obj_cgroup *replace_stock_objcg(struct memcg_stock_pcp *stock,
+ struct obj_cgroup *objcg)
+{
+ struct obj_cgroup *old = NULL;
+
+ old = drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+ ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ WRITE_ONCE(stock->cached_objcg, objcg);
+ return old;
+}
+
static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
@@ -2699,7 +2787,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
unsigned long flags;
int *bytes;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
/*
@@ -2708,11 +2796,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
* changes.
*/
if (READ_ONCE(stock->cached_objcg) != objcg) {
- old = drain_obj_stock(stock);
- obj_cgroup_get(objcg);
- stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
- ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
- WRITE_ONCE(stock->cached_objcg, objcg);
+ old = replace_stock_objcg(stock, objcg);
stock->cached_pgdat = pgdat;
} else if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
@@ -2752,7 +2836,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr)
__mod_objcg_mlstate(objcg, pgdat, idx, nr);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
}
@@ -2762,7 +2846,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
unsigned long flags;
bool ret = false;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
@@ -2770,7 +2854,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
ret = true;
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -2862,15 +2946,11 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
unsigned long flags;
unsigned int nr_pages = 0;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
- old = drain_obj_stock(stock);
- obj_cgroup_get(objcg);
- WRITE_ONCE(stock->cached_objcg, objcg);
- stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
- ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+ old = replace_stock_objcg(stock, objcg);
allow_uncharge = true; /* Allow uncharge when objcg changes */
}
stock->nr_bytes += nr_bytes;
@@ -2880,7 +2960,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
if (nr_pages)
@@ -3028,25 +3108,33 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
}
/*
- * Because folio_memcg(head) is not set on tails, set it now.
+ * The objcg is only set on the first page, so transfer it to all the
+ * other pages.
*/
-void split_page_memcg(struct page *head, int old_order, int new_order)
+void split_page_memcg(struct page *page, unsigned order)
{
- struct folio *folio = page_folio(head);
- int i;
- unsigned int old_nr = 1 << old_order;
- unsigned int new_nr = 1 << new_order;
+ struct obj_cgroup *objcg = page_objcg(page);
+ unsigned int i, nr = 1 << order;
- if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
+ if (!objcg)
return;
- for (i = new_nr; i < old_nr; i += new_nr)
- folio_page(folio, i)->memcg_data = folio->memcg_data;
+ for (i = 1; i < nr; i++)
+ page_set_objcg(&page[i], objcg);
- if (folio_memcg_kmem(folio))
- obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1);
- else
- css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1);
+ obj_cgroup_get_many(objcg, nr - 1);
+}
+
+void folio_split_memcg_refs(struct folio *folio, unsigned old_order,
+ unsigned new_order)
+{
+ unsigned new_refs;
+
+ if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
+ return;
+
+ new_refs = (1 << (old_order - new_order)) - 1;
+ css_get_many(&__folio_memcg(folio)->css, new_refs);
}
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
@@ -3374,7 +3462,7 @@ void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
refcount_add(n, &memcg->id.ref);
}
-void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
+static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
if (refcount_sub_and_test(n, &memcg->id.ref)) {
mem_cgroup_id_remove(memcg);
@@ -3389,6 +3477,24 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
mem_cgroup_id_put_many(memcg, 1);
}
+struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
+{
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
+ /*
+ * The root cgroup cannot be destroyed, so it's refcount must
+ * always be >= 1.
+ */
+ if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
+ VM_BUG_ON(1);
+ break;
+ }
+ memcg = parent_mem_cgroup(memcg);
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ }
+ return memcg;
+}
+
/**
* mem_cgroup_from_id - look up a memcg from a memcg id
* @id: the memcg id to look up
@@ -3424,6 +3530,16 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
}
#endif
+static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn)
+{
+ if (!pn)
+ return;
+
+ free_percpu(pn->lruvec_stats_percpu);
+ kfree(pn->lruvec_stats);
+ kfree(pn);
+}
+
static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
@@ -3448,23 +3564,10 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
memcg->nodeinfo[node] = pn;
return true;
fail:
- kfree(pn->lruvec_stats);
- kfree(pn);
+ free_mem_cgroup_per_node_info(pn);
return false;
}
-static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
-{
- struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
-
- if (!pn)
- return;
-
- free_percpu(pn->lruvec_stats_percpu);
- kfree(pn->lruvec_stats);
- kfree(pn);
-}
-
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
@@ -3472,7 +3575,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
obj_cgroup_put(memcg->orig_objcg);
for_each_node(node)
- free_mem_cgroup_per_node_info(memcg, node);
+ free_mem_cgroup_per_node_info(memcg->nodeinfo[node]);
memcg1_free_events(memcg);
kfree(memcg->vmstats);
free_percpu(memcg->vmstats_percpu);
@@ -3565,6 +3668,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
struct mem_cgroup *memcg, *old_memcg;
+ bool memcg_on_dfl = cgroup_subsys_on_dfl(memory_cgrp_subsys);
old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc(parent);
@@ -3582,9 +3686,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent) {
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
- page_counter_init(&memcg->memory, &parent->memory, true);
+ page_counter_init(&memcg->memory, &parent->memory, memcg_on_dfl);
page_counter_init(&memcg->swap, &parent->swap, false);
#ifdef CONFIG_MEMCG_V1
+ memcg->memory.track_failcnt = !memcg_on_dfl;
WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
page_counter_init(&memcg->kmem, &parent->kmem, false);
page_counter_init(&memcg->tcpmem, &parent->tcpmem, false);
@@ -3602,7 +3707,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
}
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
+ if (memcg_on_dfl && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
if (!cgroup_memory_nobpf)
@@ -4004,7 +4109,7 @@ static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
WRITE_ONCE(peer_ctx->value, usage);
/* initial write, register watcher */
- if (ofp->value == -1)
+ if (ofp->value == OFP_PEAK_UNSET)
list_add(&ofp->list, watchers);
WRITE_ONCE(ofp->value, usage);
@@ -4166,6 +4271,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
memcg_memory_event(memcg, MEMCG_OOM);
if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
break;
+ cond_resched();
}
memcg_wb_domain_size_changed(memcg);
@@ -4576,40 +4682,6 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
return ret;
}
-/*
- * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
- * @entry: the first swap entry for which the pages are charged
- * @nr_pages: number of pages which will be uncharged
- *
- * Call this function after successfully adding the charged page to swapcache.
- *
- * Note: This function assumes the page for which swap slot is being uncharged
- * is order 0 page.
- */
-void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
-{
- /*
- * Cgroup1's unified memory+swap counter has been charged with the
- * new swapcache page, finish the transfer by uncharging the swap
- * slot. The swap slot would also get uncharged when it dies, but
- * it can stick around indefinitely and we'd count the page twice
- * the entire time.
- *
- * Cgroup2 has separate resource counters for memory and swap,
- * so this is a non-issue here. Memory and swap charge lifetimes
- * correspond 1:1 to page and swap slot lifetimes: we charge the
- * page to memory here, and uncharge swap when the slot is freed.
- */
- if (do_memsw_account()) {
- /*
- * The swap entry might not get freed for a long time,
- * let's not wait for it. The page already received a
- * memory+swap charge, drop the swap entry duplicate.
- */
- mem_cgroup_uncharge_swap(entry, nr_pages);
- }
-}
-
struct uncharge_gather {
struct mem_cgroup *memcg;
unsigned long nr_memory;
@@ -4860,7 +4932,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return memcg1_charge_skmem(memcg, nr_pages, gfp_mask);
- if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
+ if (try_charge_memcg(memcg, gfp_mask, nr_pages) == 0) {
mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
return true;
}
@@ -4935,81 +5007,6 @@ static int __init mem_cgroup_init(void)
subsys_initcall(mem_cgroup_init);
#ifdef CONFIG_SWAP
-static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
-{
- while (!refcount_inc_not_zero(&memcg->id.ref)) {
- /*
- * The root cgroup cannot be destroyed, so it's refcount must
- * always be >= 1.
- */
- if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
- VM_BUG_ON(1);
- break;
- }
- memcg = parent_mem_cgroup(memcg);
- if (!memcg)
- memcg = root_mem_cgroup;
- }
- return memcg;
-}
-
-/**
- * mem_cgroup_swapout - transfer a memsw charge to swap
- * @folio: folio whose memsw charge to transfer
- * @entry: swap entry to move the charge to
- *
- * Transfer the memsw charge of @folio to @entry.
- */
-void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
-{
- struct mem_cgroup *memcg, *swap_memcg;
- unsigned int nr_entries;
-
- VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
-
- if (mem_cgroup_disabled())
- return;
-
- if (!do_memsw_account())
- return;
-
- memcg = folio_memcg(folio);
-
- VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
- if (!memcg)
- return;
-
- /*
- * In case the memcg owning these pages has been offlined and doesn't
- * have an ID allocated to it anymore, charge the closest online
- * ancestor for the swap instead and transfer the memory+swap charge.
- */
- swap_memcg = mem_cgroup_id_get_online(memcg);
- nr_entries = folio_nr_pages(folio);
- /* Get references for the tail pages, too */
- if (nr_entries > 1)
- mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
- mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
-
- swap_cgroup_record(folio, entry);
-
- folio_unqueue_deferred_split(folio);
- folio->memcg_data = 0;
-
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memory, nr_entries);
-
- if (memcg != swap_memcg) {
- if (!mem_cgroup_is_root(swap_memcg))
- page_counter_charge(&swap_memcg->memsw, nr_entries);
- page_counter_uncharge(&memcg->memsw, nr_entries);
- }
-
- memcg1_swapout(folio, memcg);
- css_put(&memcg->css);
-}
-
/**
* __mem_cgroup_try_charge_swap - try charging swap space for a folio
* @folio: folio being added to swap
@@ -5054,7 +5051,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
- swap_cgroup_record(folio, entry);
+ swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
return 0;
}
diff --git a/mm/memfd.c b/mm/memfd.c
index 37f7be57c2f5..c64df1343059 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -259,7 +259,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
}
/*
- * SEAL_EXEC implys SEAL_WRITE, making W^X from the start.
+ * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
*/
if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;
@@ -337,7 +337,7 @@ static int check_write_seal(unsigned long *vm_flags_ptr)
unsigned long vm_flags = *vm_flags_ptr;
unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE);
- /* If a private matting then writability is irrelevant. */
+ /* If a private mapping then writability is irrelevant. */
if (!(mask & VM_SHARED))
return 0;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 995a15eb67e2..b91a33fb6c69 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -419,18 +419,18 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
- if (pud_devmap(*pud))
+ if (pud_trans_huge(*pud))
return PUD_SHIFT;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
- if (pmd_devmap(*pmd))
+ if (pmd_trans_huge(*pmd))
return PMD_SHIFT;
pte = pte_offset_map(pmd, address);
if (!pte)
return 0;
ptent = ptep_get(pte);
- if (pte_present(ptent) && pte_devmap(ptent))
+ if (pte_present(ptent))
ret = PAGE_SHIFT;
pte_unmap(pte);
return ret;
@@ -881,12 +881,17 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
mmap_read_lock(p->mm);
ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
(void *)&priv);
+ /*
+ * ret = 1 when CMCI wins, regardless of whether try_to_unmap()
+ * succeeds or fails, then kill the process with SIGBUS.
+ * ret = 0 when poison page is a clean page and it's dropped, no
+ * SIGBUS is needed.
+ */
if (ret == 1 && priv.tk.addr)
kill_proc(&priv.tk, pfn, flags);
- else
- ret = 0;
mmap_read_unlock(p->mm);
- return ret > 0 ? -EHWPOISON : -EFAULT;
+
+ return ret > 0 ? -EHWPOISON : 0;
}
/*
@@ -1556,11 +1561,35 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
return ret;
}
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
- if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
- struct address_space *mapping;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
+ struct address_space *mapping;
+
+ if (folio_test_swapcache(folio)) {
+ pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
+ ttu &= ~TTU_HWPOISON;
+ }
+
+ /*
+ * Propagate the dirty bit from PTEs to struct page first, because we
+ * need this to decide if we should kill or just drop the page.
+ * XXX: the dirty test could be racy: set_page_dirty() may not always
+ * be called inside page lock (it's recommended but not enforced).
+ */
+ mapping = folio_mapping(folio);
+ if (!must_kill && !folio_test_dirty(folio) && mapping &&
+ mapping_can_writeback(mapping)) {
+ if (folio_mkclean(folio)) {
+ folio_set_dirty(folio);
+ } else {
+ ttu &= ~TTU_HWPOISON;
+ pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
+ pfn);
+ }
+ }
+ if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
* For hugetlb folios in shared mappings, try_to_unmap
* could potentially call huge_pmd_unshare. Because of
@@ -1572,7 +1601,7 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
if (!mapping) {
pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
folio_pfn(folio));
- return;
+ return -EBUSY;
}
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
@@ -1580,6 +1609,8 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
} else {
try_to_unmap(folio, ttu);
}
+
+ return folio_mapped(folio) ? -EBUSY : 0;
}
/*
@@ -1589,8 +1620,6 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
unsigned long pfn, int flags)
{
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
- struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
int forcekill;
@@ -1613,29 +1642,6 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
if (!folio_mapped(folio))
return true;
- if (folio_test_swapcache(folio)) {
- pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
- ttu &= ~TTU_HWPOISON;
- }
-
- /*
- * Propagate the dirty bit from PTEs to struct page first, because we
- * need this to decide if we should kill or just drop the page.
- * XXX: the dirty test could be racy: set_page_dirty() may not always
- * be called inside page lock (it's recommended but not enforced).
- */
- mapping = folio_mapping(folio);
- if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
- mapping_can_writeback(mapping)) {
- if (folio_mkclean(folio)) {
- folio_set_dirty(folio);
- } else {
- ttu &= ~TTU_HWPOISON;
- pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
- pfn);
- }
- }
-
/*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
@@ -1643,9 +1649,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
*/
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_poisoned_folio(folio, ttu);
-
- unmap_success = !folio_mapped(folio);
+ unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
if (!unmap_success)
pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
pfn, folio_mapcount(folio));
@@ -2211,9 +2215,13 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
* Must run in process context (e.g. a work queue) with interrupts
* enabled and no spinlocks held.
*
- * Return: 0 for successfully handled the memory error,
- * -EOPNOTSUPP for hwpoison_filter() filtered the error event,
- * < 0(except -EOPNOTSUPP) on failure.
+ * Return:
+ * 0 - success,
+ * -ENXIO - memory not managed by the kernel
+ * -EOPNOTSUPP - hwpoison_filter() filtered the error event,
+ * -EHWPOISON - the page was already poisoned, potentially
+ * kill process,
+ * other negative values - failure.
*/
int memory_failure(unsigned long pfn, int flags)
{
diff --git a/mm/memory.c b/mm/memory.c
index 539c0f7c6d54..2d8c265fc7d6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -76,7 +76,6 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
-#include <linux/fsnotify.h>
#include <trace/events/kmem.h>
@@ -95,14 +94,6 @@
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
-#ifndef CONFIG_NUMA
-unsigned long max_mapnr;
-EXPORT_SYMBOL(max_mapnr);
-
-struct page *mem_map;
-EXPORT_SYMBOL(mem_map);
-#endif
-
static vm_fault_t do_fault(struct vm_fault *vmf);
static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
static bool vmf_pte_changed(struct vm_fault *vmf);
@@ -122,14 +113,6 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
}
/*
- * A number of key systems in x86 including ioremap() rely on the assumption
- * that high_memory defines the upper bound on direct map memory, then end
- * of ZONE_NORMAL.
- */
-void *high_memory;
-EXPORT_SYMBOL(high_memory);
-
-/*
* Randomize the address space (stacks, mmaps, brk, etc.).
*
* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
@@ -716,42 +699,53 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
}
#endif
+/**
+ * restore_exclusive_pte - Restore a device-exclusive entry
+ * @vma: VMA covering @address
+ * @folio: the mapped folio
+ * @page: the mapped folio page
+ * @address: the virtual address
+ * @ptep: pte pointer into the locked page table mapping the folio page
+ * @orig_pte: pte value at @ptep
+ *
+ * Restore a device-exclusive non-swap entry to an ordinary present pte.
+ *
+ * The folio and the page table must be locked, and MMU notifiers must have
+ * been called to invalidate any (exclusive) device mappings.
+ *
+ * Locking the folio makes sure that anybody who just converted the pte to
+ * a device-exclusive entry can map it into the device to make forward
+ * progress without others converting it back until the folio was unlocked.
+ *
+ * If the folio lock ever becomes an issue, we can stop relying on the folio
+ * lock; it might make some scenarios with heavy thrashing less likely to
+ * make forward progress, but these scenarios might not be valid use cases.
+ *
+ * Note that the folio lock does not protect against all cases of concurrent
+ * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers
+ * must use MMU notifiers to sync against any concurrent changes.
+ */
static void restore_exclusive_pte(struct vm_area_struct *vma,
- struct page *page, unsigned long address,
- pte_t *ptep)
+ struct folio *folio, struct page *page, unsigned long address,
+ pte_t *ptep, pte_t orig_pte)
{
- struct folio *folio = page_folio(page);
- pte_t orig_pte;
pte_t pte;
- swp_entry_t entry;
- orig_pte = ptep_get(ptep);
+ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+
pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
if (pte_swp_soft_dirty(orig_pte))
pte = pte_mksoft_dirty(pte);
- entry = pte_to_swp_entry(orig_pte);
if (pte_swp_uffd_wp(orig_pte))
pte = pte_mkuffd_wp(pte);
- else if (is_writable_device_exclusive_entry(entry))
- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
-
- VM_BUG_ON_FOLIO(pte_write(pte) && (!folio_test_anon(folio) &&
- PageAnonExclusive(page)), folio);
-
- /*
- * No need to take a page reference as one was already
- * created when the swap entry was made.
- */
- if (folio_test_anon(folio))
- folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE);
- else
- /*
- * Currently device exclusive access only supports anonymous
- * memory so the entry shouldn't point to a filebacked page.
- */
- WARN_ON_ONCE(1);
+ if ((vma->vm_flags & VM_WRITE) &&
+ can_change_pte_writable(vma, address, pte)) {
+ if (folio_test_dirty(folio))
+ pte = pte_mkdirty(pte);
+ pte = pte_mkwrite(pte, vma);
+ }
set_pte_at(vma->vm_mm, address, ptep, pte);
/*
@@ -765,16 +759,15 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
* Tries to restore an exclusive pte if the page lock can be acquired without
* sleeping.
*/
-static int
-try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
- unsigned long addr)
+static int try_restore_exclusive_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pte_t orig_pte)
{
- swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte));
+ struct folio *folio = page_folio(page);
- if (trylock_page(page)) {
- restore_exclusive_pte(vma, page, addr, src_pte);
- unlock_page(page);
+ if (folio_trylock(folio)) {
+ restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte);
+ folio_unlock(folio);
return 0;
}
@@ -854,7 +847,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
folio_get(folio);
rss[mm_counter(folio)]++;
/* Cannot fail as these pages cannot get pinned. */
- folio_try_dup_anon_rmap_pte(folio, page, src_vma);
+ folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma);
/*
* We do not preserve soft-dirty information, because so
@@ -880,7 +873,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* (ie. COW) mappings.
*/
VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
- if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+ if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte))
return -EBUSY;
return -ENOENT;
} else if (is_pte_marker_entry(entry)) {
@@ -1008,14 +1001,14 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
folio_ref_add(folio, nr);
if (folio_test_anon(folio)) {
if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
- nr, src_vma))) {
+ nr, dst_vma, src_vma))) {
folio_ref_sub(folio, nr);
return -EAGAIN;
}
rss[MM_ANONPAGES] += nr;
VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
} else {
- folio_dup_file_rmap_ptes(folio, page, nr);
+ folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
rss[mm_counter_file(folio)] += nr;
}
if (any_writable)
@@ -1033,7 +1026,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* guarantee the pinned page won't be randomly replaced in the
* future.
*/
- if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
+ if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) {
/* Page may be pinned, we have to copy. */
folio_put(folio);
err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
@@ -1043,7 +1036,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
rss[MM_ANONPAGES]++;
VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
} else {
- folio_dup_file_rmap_pte(folio, page);
+ folio_dup_file_rmap_pte(folio, page, dst_vma);
rss[mm_counter_file(folio)]++;
}
@@ -1363,12 +1356,12 @@ int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
pgd_t *src_pgd, *dst_pgd;
- unsigned long next;
unsigned long addr = src_vma->vm_start;
unsigned long end = src_vma->vm_end;
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
+ unsigned long next, pfn;
bool is_cow;
int ret;
@@ -1379,11 +1372,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
- /*
- * We do not free on error cases below as remove_vma
- * gets called on error from higher level routine
- */
- ret = track_pfn_copy(src_vma);
+ ret = track_pfn_copy(dst_vma, src_vma, &pfn);
if (ret)
return ret;
}
@@ -1420,7 +1409,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
continue;
if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
addr, next))) {
- untrack_pfn_clear(dst_vma);
ret = -ENOMEM;
break;
}
@@ -1430,6 +1418,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
raw_write_seqcount_end(&src_mm->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
}
+ if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
+ untrack_pfn_copy(dst_vma, pfn);
return ret;
}
@@ -1623,8 +1613,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
*/
WARN_ON_ONCE(!vma_is_anonymous(vma));
rss[mm_counter(folio)]--;
- if (is_device_private_entry(entry))
- folio_remove_rmap_pte(folio, page, vma);
+ folio_remove_rmap_pte(folio, page, vma);
folio_put(folio);
} else if (!non_swap_entry(entry)) {
/* Genuine swap entries, hence a private anon pages */
@@ -1719,7 +1708,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pmd_t pmdval;
unsigned long start = addr;
bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
- bool direct_reclaim = false;
+ bool direct_reclaim = true;
int nr;
retry:
@@ -1734,8 +1723,10 @@ retry:
do {
bool any_skipped = false;
- if (need_resched())
+ if (need_resched()) {
+ direct_reclaim = false;
break;
+ }
nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
&force_flush, &force_break, &any_skipped);
@@ -1743,11 +1734,20 @@ retry:
can_reclaim_pt = false;
if (unlikely(force_break)) {
addr += nr * PAGE_SIZE;
+ direct_reclaim = false;
break;
}
} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
- if (can_reclaim_pt && addr == end)
+ /*
+ * Fast path: try to hold the pmd lock and unmap the PTE page.
+ *
+ * If the pte lock was released midway (retry case), or if the attempt
+ * to hold the pmd lock failed, then we need to recheck all pte entries
+ * to ensure they are still none, thereby preventing the pte entries
+ * from being repopulated by another thread.
+ */
+ if (can_reclaim_pt && direct_reclaim && addr == end)
direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
add_mm_rss_vec(mm, rss);
@@ -2125,19 +2125,39 @@ static int validate_page_before_insert(struct vm_area_struct *vma,
}
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
- unsigned long addr, struct page *page, pgprot_t prot)
+ unsigned long addr, struct page *page,
+ pgprot_t prot, bool mkwrite)
{
struct folio *folio = page_folio(page);
- pte_t pteval;
+ pte_t pteval = ptep_get(pte);
+
+ if (!pte_none(pteval)) {
+ if (!mkwrite)
+ return -EBUSY;
+
+ /* see insert_pfn(). */
+ if (pte_pfn(pteval) != page_to_pfn(page)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval)));
+ return -EFAULT;
+ }
+ pteval = maybe_mkwrite(pteval, vma);
+ pteval = pte_mkyoung(pteval);
+ if (ptep_set_access_flags(vma, addr, pte, pteval, 1))
+ update_mmu_cache(vma, addr, pte);
+ return 0;
+ }
- if (!pte_none(ptep_get(pte)))
- return -EBUSY;
/* Ok, finally just insert the thing.. */
pteval = mk_pte(page, prot);
if (unlikely(is_zero_folio(folio))) {
pteval = pte_mkspecial(pteval);
} else {
folio_get(folio);
+ pteval = mk_pte(page, prot);
+ if (mkwrite) {
+ pteval = pte_mkyoung(pteval);
+ pteval = maybe_mkwrite(pte_mkdirty(pteval), vma);
+ }
inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
folio_add_file_rmap_pte(folio, page, vma);
}
@@ -2146,7 +2166,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
}
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *page, pgprot_t prot)
+ struct page *page, pgprot_t prot, bool mkwrite)
{
int retval;
pte_t *pte;
@@ -2159,7 +2179,8 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
pte = get_locked_pte(vma->vm_mm, addr, &ptl);
if (!pte)
goto out;
- retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
+ retval = insert_page_into_pte_locked(vma, pte, addr, page, prot,
+ mkwrite);
pte_unmap_unlock(pte, ptl);
out:
return retval;
@@ -2173,7 +2194,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
err = validate_page_before_insert(vma, page);
if (err)
return err;
- return insert_page_into_pte_locked(vma, pte, addr, page, prot);
+ return insert_page_into_pte_locked(vma, pte, addr, page, prot, false);
}
/* insert_pages() amortizes the cost of spinlock operations
@@ -2309,7 +2330,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
BUG_ON(vma->vm_flags & VM_PFNMAP);
vm_flags_set(vma, VM_MIXEDMAP);
}
- return insert_page(vma, addr, page, vma->vm_page_prot);
+ return insert_page(vma, addr, page, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vm_insert_page);
@@ -2589,7 +2610,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
- err = insert_page(vma, addr, page, pgprot);
+ err = insert_page(vma, addr, page, pgprot, mkwrite);
} else {
return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
@@ -2602,6 +2623,26 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
+vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
+ bool write)
+{
+ pgprot_t pgprot = vmf->vma->vm_page_prot;
+ unsigned long addr = vmf->address;
+ int err;
+
+ if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ err = insert_page(vmf->vma, addr, page, pgprot, write);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);
+
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
{
@@ -3040,8 +3081,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
next = pgd_addr_end(addr, end);
if (pgd_none(*pgd) && !create)
continue;
- if (WARN_ON_ONCE(pgd_leaf(*pgd)))
- return -EINVAL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
+ err = -EINVAL;
+ break;
+ }
if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
if (!create)
continue;
@@ -3664,19 +3707,86 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
return ret;
}
-static bool wp_can_reuse_anon_folio(struct folio *folio,
- struct vm_area_struct *vma)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
{
+ bool exclusive = false;
+
+ /* Let's just free up a large folio if only a single page is mapped. */
+ if (folio_large_mapcount(folio) <= 1)
+ return false;
+
/*
- * We could currently only reuse a subpage of a large folio if no
- * other subpages of the large folios are still mapped. However,
- * let's just consistently not reuse subpages even if we could
- * reuse in that scenario, and give back a large folio a bit
- * sooner.
+ * The assumption for anonymous folios is that each page can only get
+ * mapped once into each MM. The only exception are KSM folios, which
+ * are always small.
+ *
+ * Each taken mapcount must be paired with exactly one taken reference,
+ * whereby the refcount must be incremented before the mapcount when
+ * mapping a page, and the refcount must be decremented after the
+ * mapcount when unmapping a page.
+ *
+ * If all folio references are from mappings, and all mappings are in
+ * the page tables of this MM, then this folio is exclusive to this MM.
*/
- if (folio_test_large(folio))
+ if (folio_test_large_maybe_mapped_shared(folio))
return false;
+ VM_WARN_ON_ONCE(folio_test_ksm(folio));
+ VM_WARN_ON_ONCE(folio_mapcount(folio) > folio_nr_pages(folio));
+ VM_WARN_ON_ONCE(folio_entire_mapcount(folio));
+
+ if (unlikely(folio_test_swapcache(folio))) {
+ /*
+ * Note: freeing up the swapcache will fail if some PTEs are
+ * still swap entries.
+ */
+ if (!folio_trylock(folio))
+ return false;
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ }
+
+ if (folio_large_mapcount(folio) != folio_ref_count(folio))
+ return false;
+
+ /* Stabilize the mapcount vs. refcount and recheck. */
+ folio_lock_large_mapcount(folio);
+ VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio));
+
+ if (folio_test_large_maybe_mapped_shared(folio))
+ goto unlock;
+ if (folio_large_mapcount(folio) != folio_ref_count(folio))
+ goto unlock;
+
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
+ folio_mm_id(folio, 1) != vma->vm_mm->mm_id);
+
+ /*
+ * Do we need the folio lock? Likely not. If there would have been
+ * references from page migration/swapout, we would have detected
+ * an additional folio reference and never ended up here.
+ */
+ exclusive = true;
+unlock:
+ folio_unlock_large_mapcount(folio);
+ return exclusive;
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
+{
+ BUILD_BUG();
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static bool wp_can_reuse_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
+{
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio))
+ return __wp_can_reuse_large_anon_folio(folio, vma);
+
/*
* We have to verify under folio lock: these early checks are
* just an optimization to avoid locking the folio and freeing
@@ -3785,13 +3895,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
- * VM_PFNMAP VMA.
+ * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called.
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
- if (!vmf->page)
+ if (!vmf->page || is_fsdax_page(vmf->page)) {
+ vmf->page = NULL;
return wp_pfn_shared(vmf);
+ }
return wp_page_shared(vmf, folio);
}
@@ -3981,7 +4093,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
folio_put(folio);
return ret;
}
- mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0,
vma->vm_mm, vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
mmu_notifier_invalidate_range_start(&range);
@@ -3989,7 +4101,8 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
- restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
+ restore_exclusive_pte(vma, folio, vmf->page, vmf->address,
+ vmf->pte, vmf->orig_pte);
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -4337,10 +4450,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Get a page reference while we know the page can't be
* freed.
*/
- get_page(vmf->page);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
- put_page(vmf->page);
+ if (trylock_page(vmf->page)) {
+ struct dev_pagemap *pgmap;
+
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ pgmap = page_pgmap(vmf->page);
+ ret = pgmap->ops->migrate_to_ram(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ } else {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ }
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else if (is_pte_marker_entry(entry)) {
@@ -4394,7 +4515,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
need_clear_cache = true;
- mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
+ memcg1_swapin(entry, nr_pages);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
@@ -5172,7 +5293,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
!(vma->vm_flags & VM_SHARED);
int type, nr_pages;
- unsigned long addr = vmf->address;
+ unsigned long addr;
+ bool needs_fallback = false;
+
+fallback:
+ addr = vmf->address;
/* Did we COW the page? */
if (is_cow)
@@ -5211,7 +5336,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
* approach also applies to non-anonymous-shmem faults to avoid
* inflating the RSS of the process.
*/
- if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
+ if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
+ unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
pgoff_t idx = folio_page_idx(folio, page);
@@ -5247,9 +5373,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
goto unlock;
} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
- update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
- ret = VM_FAULT_NOPAGE;
- goto unlock;
+ needs_fallback = true;
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto fallback;
}
folio_ref_add(folio, nr_pages - 1);
@@ -5558,7 +5684,7 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
* Flag if the folio is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
- if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
+ if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
*flags |= TNF_SHARED;
/*
* For memory tiering mode, cpupid of slow memory page is used
@@ -5732,17 +5858,8 @@ out_map:
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
-
if (vma_is_anonymous(vma))
return do_huge_pmd_anonymous_page(vmf);
- /*
- * Currently we just emit PAGE_SIZE for our fault events, so don't allow
- * a huge fault if we have a pre content watch on this file. This would
- * be trivial to support, but there would need to be tests to ensure
- * this works properly and those don't exist currently.
- */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
return VM_FAULT_FALLBACK;
@@ -5766,9 +5883,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
}
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
@@ -5791,9 +5905,6 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vma))
return VM_FAULT_FALLBACK;
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -5811,9 +5922,6 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
if (vma_is_anonymous(vma))
goto split;
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
@@ -6347,6 +6455,88 @@ fail:
#endif
#ifdef CONFIG_PER_VMA_LOCK
+static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+{
+ unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+
+ /* Additional refcnt if the vma is attached. */
+ if (!detaching)
+ tgt_refcnt++;
+
+ /*
+ * If vma is detached then only vma_mark_attached() can raise the
+ * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+ */
+ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+ return false;
+
+ rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+ rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+ refcount_read(&vma->vm_refcnt) == tgt_refcnt,
+ TASK_UNINTERRUPTIBLE);
+ lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
+
+ return true;
+}
+
+static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
+{
+ *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
+ rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+}
+
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
+{
+ bool locked;
+
+ /*
+ * __vma_enter_locked() returns false immediately if the vma is not
+ * attached, otherwise it waits until refcnt is indicating that vma
+ * is attached with no readers.
+ */
+ locked = __vma_enter_locked(vma, false);
+
+ /*
+ * We should use WRITE_ONCE() here because we can have concurrent reads
+ * from the early lockless pessimistic check in vma_start_read().
+ * We don't really care about the correctness of that early check, but
+ * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+ */
+ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+
+ if (locked) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(detached); /* vma should remain attached */
+ }
+}
+EXPORT_SYMBOL_GPL(__vma_start_write);
+
+void vma_mark_detached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_attached(vma);
+
+ /*
+ * We are the only writer, so no need to use vma_refcount_put().
+ * The condition below is unlikely because the vma has been already
+ * write-locked and readers can increment vm_refcnt only temporarily
+ * before they check vm_lock_seq, realize the vma is locked and drop
+ * back the vm_refcnt. That is a narrow window for observing a raised
+ * vm_refcnt.
+ */
+ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+ /* Wait until vma is detached with no readers. */
+ if (__vma_enter_locked(vma, true)) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(!detached);
+ }
+ }
+}
+
/*
* Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
* stable and not isolated. If the VMA is not found or is being modified the
@@ -6364,15 +6554,17 @@ retry:
if (!vma)
goto inval;
- if (!vma_start_read(vma))
- goto inval;
+ vma = vma_start_read(mm, vma);
+ if (IS_ERR_OR_NULL(vma)) {
+ /* Check if the VMA got isolated after we found it */
+ if (PTR_ERR(vma) == -EAGAIN) {
+ count_vm_vma_lock_event(VMA_LOCK_MISS);
+ /* The area was replaced with another one */
+ goto retry;
+ }
- /* Check if the VMA got isolated after we found it */
- if (vma->detached) {
- vma_end_read(vma);
- count_vm_vma_lock_event(VMA_LOCK_MISS);
- /* The area was replaced with another one */
- goto retry;
+ /* Failed to lock the VMA */
+ goto inval;
}
/*
* At this point, we have a stable reference to a VMA: The VMA is
@@ -6381,8 +6573,9 @@ retry:
* fields are accessible for RCU readers.
*/
- /* Check since vm_start/vm_end might change before we lock the VMA */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ /* Check if the vma we locked is the right one. */
+ if (unlikely(vma->vm_mm != mm ||
+ address < vma->vm_start || address >= vma->vm_end))
goto inval_end_read;
rcu_read_unlock();
@@ -6477,6 +6670,7 @@ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
args->lock = lock;
args->ptep = ptep;
args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
+ args->addr_mask = addr_mask;
args->pgprot = pgprot;
args->writable = writable;
args->special = special;
@@ -6636,7 +6830,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
resource_size_t phys_addr;
- unsigned long prot = 0;
+ pgprot_t prot = __pgprot(0);
void __iomem *maddr;
int offset = offset_in_page(addr);
int ret = -EINVAL;
@@ -6646,7 +6840,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
retry:
if (follow_pfnmap_start(&args))
return -EINVAL;
- prot = pgprot_val(args.pgprot);
+ prot = args.pgprot;
phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT;
writable = args.writable;
follow_pfnmap_end(&args);
@@ -6661,7 +6855,7 @@ retry:
if (follow_pfnmap_start(&args))
goto out_unmap;
- if ((prot != pgprot_val(args.pgprot)) ||
+ if ((pgprot_val(prot) != pgprot_val(args.pgprot)) ||
(phys_addr != (args.pfn << PAGE_SHIFT)) ||
(writable != args.writable)) {
follow_pfnmap_end(&args);
@@ -6803,6 +6997,124 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
}
EXPORT_SYMBOL_GPL(access_process_vm);
+#ifdef CONFIG_BPF_SYSCALL
+/*
+ * Copy a string from another process's address space as given in mm.
+ * If there is any error return -EFAULT.
+ */
+static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ void *old_buf = buf;
+ int err = 0;
+
+ *(char *)buf = '\0';
+
+ if (mmap_read_lock_killable(mm))
+ return -EFAULT;
+
+ addr = untagged_addr_remote(mm, addr);
+
+ /* Avoid triggering the temporary warning in __get_user_pages */
+ if (!vma_lookup(mm, addr)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ while (len) {
+ int bytes, offset, retval;
+ void *maddr;
+ struct page *page;
+ struct vm_area_struct *vma = NULL;
+
+ page = get_user_page_vma_remote(mm, addr, gup_flags, &vma);
+ if (IS_ERR(page)) {
+ /*
+ * Treat as a total failure for now until we decide how
+ * to handle the CONFIG_HAVE_IOREMAP_PROT case and
+ * stack expansion.
+ */
+ *(char *)buf = '\0';
+ err = -EFAULT;
+ goto out;
+ }
+
+ bytes = len;
+ offset = addr & (PAGE_SIZE - 1);
+ if (bytes > PAGE_SIZE - offset)
+ bytes = PAGE_SIZE - offset;
+
+ maddr = kmap_local_page(page);
+ retval = strscpy(buf, maddr + offset, bytes);
+ if (retval >= 0) {
+ /* Found the end of the string */
+ buf += retval;
+ unmap_and_put_page(page, maddr);
+ break;
+ }
+
+ buf += bytes - 1;
+ /*
+ * Because strscpy always NUL terminates we need to
+ * copy the last byte in the page if we are going to
+ * load more pages
+ */
+ if (bytes != len) {
+ addr += bytes - 1;
+ copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1);
+ buf += 1;
+ addr += 1;
+ }
+ len -= bytes;
+
+ unmap_and_put_page(page, maddr);
+ }
+
+out:
+ mmap_read_unlock(mm);
+ if (err)
+ return err;
+ return buf - old_buf;
+}
+
+/**
+ * copy_remote_vm_str - copy a string from another process's address space.
+ * @tsk: the task of the target address space
+ * @addr: start address to read from
+ * @buf: destination buffer
+ * @len: number of bytes to copy
+ * @gup_flags: flags modifying lookup behaviour
+ *
+ * The caller must hold a reference on @mm.
+ *
+ * Return: number of bytes copied from @addr (source) to @buf (destination);
+ * not including the trailing NUL. Always guaranteed to leave NUL-terminated
+ * buffer. On any error, return -EFAULT.
+ */
+int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm) {
+ *(char *)buf = '\0';
+ return -EFAULT;
+ }
+
+ ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags);
+
+ mmput(mm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(copy_remote_vm_str);
+#endif /* CONFIG_BPF_SYSCALL */
+
/*
* Print the name of a VMA.
*/
@@ -6835,10 +7147,8 @@ void __might_fault(const char *file, int line)
if (pagefault_disabled())
return;
__might_sleep(file, line);
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
-#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e3655f07dd6e..8305483de38b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1813,35 +1813,26 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
page = pfn_to_page(pfn);
folio = page_folio(page);
- /*
- * No reference or lock is held on the folio, so it might
- * be modified concurrently (e.g. split). As such,
- * folio_nr_pages() may read garbage. This is fine as the outer
- * loop will revisit the split folio later.
- */
+ if (!folio_try_get(folio))
+ continue;
+
+ if (unlikely(page_folio(page) != folio))
+ goto put_folio;
+
if (folio_test_large(folio))
pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
- /*
- * HWPoison pages have elevated reference counts so the migration would
- * fail on them. It also doesn't make any sense to migrate them in the
- * first place. Still try to unmap such a page in case it is still mapped
- * (keep the unmap as the catch all safety net).
- */
- if (folio_test_hwpoison(folio) ||
- (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
+ if (folio_contain_hwpoisoned_page(folio)) {
if (WARN_ON(folio_test_lru(folio)))
folio_isolate_lru(folio);
- if (folio_mapped(folio))
- unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK);
- continue;
- }
-
- if (!folio_try_get(folio))
- continue;
+ if (folio_mapped(folio)) {
+ folio_lock(folio);
+ unmap_poisoned_folio(folio, pfn, false);
+ folio_unlock(folio);
+ }
- if (unlikely(page_folio(page) != folio))
goto put_folio;
+ }
if (!isolate_folio_to_list(folio, &source)) {
if (__ratelimit(&migrate_rs)) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bbaadbeeb291..b28a1e6ae096 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -196,6 +196,37 @@ int numa_nearest_node(int node, unsigned int state)
}
EXPORT_SYMBOL_GPL(numa_nearest_node);
+/**
+ * nearest_node_nodemask - Find the node in @mask at the nearest distance
+ * from @node.
+ *
+ * @node: a valid node ID to start the search from.
+ * @mask: a pointer to a nodemask representing the allowed nodes.
+ *
+ * This function iterates over all nodes in @mask and calculates the
+ * distance from the starting @node, then it returns the node ID that is
+ * the closest to @node, or MAX_NUMNODES if no node is found.
+ *
+ * Note that @node must be a valid node ID usable with node_distance(),
+ * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
+ * or unexpected behavior.
+ */
+int nearest_node_nodemask(int node, nodemask_t *mask)
+{
+ int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
+
+ for_each_node_mask(n, *mask) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(nearest_node_nodemask);
+
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
@@ -642,11 +673,11 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
- * See folio_likely_mapped_shared() on possible imprecision when we
+ * See folio_maybe_mapped_shared() on possible imprecision when we
* cannot easily detect if a folio is shared.
*/
if ((flags & MPOL_MF_MOVE_ALL) ||
- (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
+ (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
if (!folio_isolate_hugetlb(folio, qp->pagelist))
qp->nr_failed++;
unlock:
@@ -1033,10 +1064,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
- * See folio_likely_mapped_shared() on possible imprecision when we
+ * See folio_maybe_mapped_shared() on possible imprecision when we
* cannot easily detect if a folio is shared.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) {
+ if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
if (folio_isolate_lru(folio)) {
list_add_tail(&folio->lru, foliolist);
node_stat_mod_folio(folio,
diff --git a/mm/memremap.c b/mm/memremap.c
index 40d4547ce514..2aebc1b192da 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -458,8 +458,9 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
void free_zone_device_folio(struct folio *folio)
{
- if (WARN_ON_ONCE(!folio->page.pgmap->ops ||
- !folio->page.pgmap->ops->page_free))
+ struct dev_pagemap *pgmap = folio->pgmap;
+
+ if (WARN_ON_ONCE(!pgmap))
return;
mem_cgroup_uncharge(folio);
@@ -484,19 +485,42 @@ void free_zone_device_folio(struct folio *folio)
* For other types of ZONE_DEVICE pages, migration is either
* handled differently or not done at all, so there is no need
* to clear folio->mapping.
+ *
+ * FS DAX pages clear the mapping when the folio->share count hits
+ * zero which indicating the page has been removed from the file
+ * system mapping.
*/
- folio->mapping = NULL;
- folio->page.pgmap->ops->page_free(folio_page(folio, 0));
+ if (pgmap->type != MEMORY_DEVICE_FS_DAX &&
+ pgmap->type != MEMORY_DEVICE_GENERIC)
+ folio->mapping = NULL;
+
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
+ if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
+ break;
+ pgmap->ops->page_free(folio_page(folio, 0));
+ put_dev_pagemap(pgmap);
+ break;
- if (folio->page.pgmap->type != MEMORY_DEVICE_PRIVATE &&
- folio->page.pgmap->type != MEMORY_DEVICE_COHERENT)
+ case MEMORY_DEVICE_GENERIC:
/*
* Reset the refcount to 1 to prepare for handing out the page
* again.
*/
folio_set_count(folio, 1);
- else
- put_dev_pagemap(folio->page.pgmap);
+ break;
+
+ case MEMORY_DEVICE_FS_DAX:
+ wake_up_var(&folio->page);
+ break;
+
+ case MEMORY_DEVICE_PCI_P2PDMA:
+ if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
+ break;
+ pgmap->ops->page_free(folio_page(folio, 0));
+ break;
+ }
}
void zone_device_page_init(struct page *page)
@@ -505,26 +529,8 @@ void zone_device_page_init(struct page *page)
* Drivers shouldn't be allocating pages after calling
* memunmap_pages().
*/
- WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref));
+ WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref));
set_page_count(page, 1);
lock_page(page);
}
EXPORT_SYMBOL_GPL(zone_device_page_init);
-
-#ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
-{
- if (folio->page.pgmap->type != MEMORY_DEVICE_FS_DAX)
- return false;
-
- /*
- * fsdax page refcounts are 1-based, rather than 0-based: if
- * refcount is 1, then the page is free and the refcount is
- * stable because nobody holds a reference on the page.
- */
- if (folio_ref_sub_return(folio, refs) == 1)
- wake_up_var(&folio->_refcount);
- return true;
-}
-EXPORT_SYMBOL(__put_devmap_managed_folio_refs);
-#endif /* CONFIG_FS_DAX */
diff --git a/mm/migrate.c b/mm/migrate.c
index fb19a18892c8..f3ee6d8d5e2e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -202,7 +202,7 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
return false;
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+ VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
mm_forbids_zeropage(pvmw->vma->vm_mm))
@@ -328,7 +328,7 @@ static bool remove_migration_pte(struct folio *folio,
folio_add_file_rmap_pte(folio, new, vma);
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
}
- if (vma->vm_flags & VM_LOCKED)
+ if (READ_ONCE(vma->vm_flags) & VM_LOCKED)
mlock_drain_local();
trace_remove_migration_pte(pvmw.address, pte_val(pte),
@@ -518,15 +518,13 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_anon(folio) && folio_test_large(folio))
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
folio_ref_add(newfolio, nr); /* add cache reference */
- if (folio_test_swapbacked(folio)) {
+ if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
- if (folio_test_swapcache(folio)) {
- folio_set_swapcache(newfolio);
- newfolio->private = folio_get_private(folio);
- }
+ if (folio_test_swapcache(folio)) {
+ folio_set_swapcache(newfolio);
+ newfolio->private = folio_get_private(folio);
entries = nr;
} else {
- VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
entries = 1;
}
@@ -2228,7 +2226,7 @@ static int __add_folio_for_migration(struct folio *folio, int node,
if (folio_nid(folio) == node)
return 0;
- if (folio_likely_mapped_shared(folio) && !migrate_all)
+ if (folio_maybe_mapped_shared(folio) && !migrate_all)
return -EACCES;
if (folio_test_hugetlb(folio)) {
@@ -2653,11 +2651,10 @@ int migrate_misplaced_folio_prepare(struct folio *folio,
* processes with execute permissions as they are probably
* shared libraries.
*
- * See folio_likely_mapped_shared() on possible imprecision
+ * See folio_maybe_mapped_shared() on possible imprecision
* when we cannot easily detect if a folio is shared.
*/
- if ((vma->vm_flags & VM_EXEC) &&
- folio_likely_mapped_shared(folio))
+ if ((vma->vm_flags & VM_EXEC) && folio_maybe_mapped_shared(folio))
return -EACCES;
/*
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 9cf26592ac93..3158afe7eb23 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
+ struct folio *fault_folio = migrate->fault_page ?
+ page_folio(migrate->fault_page) : NULL;
struct vm_area_struct *vma = walk->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start, unmapped = 0;
@@ -88,11 +90,16 @@ again:
folio_get(folio);
spin_unlock(ptl);
+ /* FIXME: we don't expect THP for fault_folio */
+ if (WARN_ON_ONCE(fault_folio == folio))
+ return migrate_vma_collect_skip(start, end,
+ walk);
if (unlikely(!folio_trylock(folio)))
return migrate_vma_collect_skip(start, end,
walk);
ret = split_folio(folio);
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
if (ret)
return migrate_vma_collect_skip(start, end,
@@ -106,6 +113,7 @@ again:
arch_enter_lazy_mmu_mode();
for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ struct dev_pagemap *pgmap;
unsigned long mpfn = 0, pfn;
struct folio *folio;
struct page *page;
@@ -133,9 +141,10 @@ again:
goto next;
page = pfn_swap_entry_to_page(entry);
+ pgmap = page_pgmap(page);
if (!(migrate->flags &
MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
- page->pgmap->owner != migrate->pgmap_owner)
+ pgmap->owner != migrate->pgmap_owner)
goto next;
mpfn = migrate_pfn(page_to_pfn(page)) |
@@ -152,12 +161,16 @@ again:
}
page = vm_normal_page(migrate->vma, addr, pte);
if (page && !is_zone_device_page(page) &&
- !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
- goto next;
- else if (page && is_device_coherent_page(page) &&
- (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
- page->pgmap->owner != migrate->pgmap_owner))
+ !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
goto next;
+ } else if (page && is_device_coherent_page(page)) {
+ pgmap = page_pgmap(page);
+
+ if (!(migrate->flags &
+ MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+ pgmap->owner != migrate->pgmap_owner)
+ goto next;
+ }
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
@@ -192,7 +205,7 @@ again:
* optimisation to avoid walking the rmap later with
* try_to_migrate().
*/
- if (folio_trylock(folio)) {
+ if (fault_folio == folio || folio_trylock(folio)) {
bool anon_exclusive;
pte_t swp_pte;
@@ -204,7 +217,8 @@ again:
if (folio_try_share_anon_rmap_pte(folio, page)) {
set_pte_at(mm, addr, ptep, pte);
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
mpfn = 0;
goto next;
@@ -363,6 +377,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
unsigned long npages,
struct page *fault_page)
{
+ struct folio *fault_folio = fault_page ?
+ page_folio(fault_page) : NULL;
unsigned long i, restore = 0;
bool allow_drain = true;
unsigned long unmapped = 0;
@@ -427,7 +443,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
remove_migration_ptes(folio, folio, 0);
src_pfns[i] = 0;
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
restore--;
}
@@ -536,6 +553,8 @@ int migrate_vma_setup(struct migrate_vma *args)
return -EINVAL;
if (args->fault_page && !is_device_private_page(args->fault_page))
return -EINVAL;
+ if (args->fault_page && !PageLocked(args->fault_page))
+ return -EINVAL;
memset(args->src, 0, sizeof(*args->src) * nr_pages);
args->cpages = 0;
@@ -799,19 +818,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
}
EXPORT_SYMBOL(migrate_vma_pages);
-/*
- * migrate_device_finalize() - complete page migration
- * @src_pfns: src_pfns returned from migrate_device_range()
- * @dst_pfns: array of pfns allocated by the driver to migrate memory to
- * @npages: number of pages in the range
- *
- * Completes migration of the page by removing special migration entries.
- * Drivers must ensure copying of page data is complete and visible to the CPU
- * before calling this.
- */
-void migrate_device_finalize(unsigned long *src_pfns,
- unsigned long *dst_pfns, unsigned long npages)
+static void __migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns,
+ unsigned long npages,
+ struct page *fault_page)
{
+ struct folio *fault_folio = fault_page ?
+ page_folio(fault_page) : NULL;
unsigned long i;
for (i = 0; i < npages; i++) {
@@ -824,6 +837,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!page) {
if (dst) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
@@ -834,29 +848,43 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
if (dst) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
dst = src;
}
+ if (!folio_is_zone_device(dst))
+ folio_add_lru(dst);
remove_migration_ptes(src, dst, 0);
- folio_unlock(src);
-
- if (folio_is_zone_device(src))
- folio_put(src);
- else
- folio_putback_lru(src);
+ if (fault_folio != src)
+ folio_unlock(src);
+ folio_put(src);
if (dst != src) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
- if (folio_is_zone_device(dst))
- folio_put(dst);
- else
- folio_putback_lru(dst);
+ folio_put(dst);
}
}
}
+
+/*
+ * migrate_device_finalize() - complete page migration
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Completes migration of the page by removing special migration entries.
+ * Drivers must ensure copying of page data is complete and visible to the CPU
+ * before calling this.
+ */
+void migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages)
+{
+ return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
+}
EXPORT_SYMBOL(migrate_device_finalize);
/**
@@ -872,10 +900,27 @@ EXPORT_SYMBOL(migrate_device_finalize);
*/
void migrate_vma_finalize(struct migrate_vma *migrate)
{
- migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
+ __migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
+ migrate->fault_page);
}
EXPORT_SYMBOL(migrate_vma_finalize);
+static unsigned long migrate_device_pfn_lock(unsigned long pfn)
+{
+ struct folio *folio;
+
+ folio = folio_get_nontail_page(pfn_to_page(pfn));
+ if (!folio)
+ return 0;
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ return 0;
+ }
+
+ return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+}
+
/**
* migrate_device_range() - migrate device private pfns to normal memory.
* @src_pfns: array large enough to hold migrating source device private pfns.
@@ -900,29 +945,35 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start,
{
unsigned long i, pfn;
- for (pfn = start, i = 0; i < npages; pfn++, i++) {
- struct folio *folio;
+ for (pfn = start, i = 0; i < npages; pfn++, i++)
+ src_pfns[i] = migrate_device_pfn_lock(pfn);
- folio = folio_get_nontail_page(pfn_to_page(pfn));
- if (!folio) {
- src_pfns[i] = 0;
- continue;
- }
+ migrate_device_unmap(src_pfns, npages, NULL);
- if (!folio_trylock(folio)) {
- src_pfns[i] = 0;
- folio_put(folio);
- continue;
- }
+ return 0;
+}
+EXPORT_SYMBOL(migrate_device_range);
- src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
- }
+/**
+ * migrate_device_pfns() - migrate device private pfns to normal memory.
+ * @src_pfns: pre-popluated array of source device private pfns to migrate.
+ * @npages: number of pages to migrate.
+ *
+ * Similar to migrate_device_range() but supports non-contiguous pre-popluated
+ * array of device pages to migrate.
+ */
+int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; i++)
+ src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]);
migrate_device_unmap(src_pfns, npages, NULL);
return 0;
}
-EXPORT_SYMBOL(migrate_device_range);
+EXPORT_SYMBOL(migrate_device_pfns);
/*
* Migrate a device coherent folio back to normal memory. The caller should have
diff --git a/mm/mincore.c b/mm/mincore.c
index d6bd19e520fc..832f29f46767 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -239,7 +239,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
start = untagged_addr(start);
/* Check the start address: needs to be page-aligned.. */
- if (start & ~PAGE_MASK)
+ if (unlikely(start & ~PAGE_MASK))
return -EINVAL;
/* ..and we need to be passed a valid user-space range */
diff --git a/mm/mlock.c b/mm/mlock.c
index cde076fa7d5e..3cb72b579ffd 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -368,6 +368,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
if (is_huge_zero_pmd(*pmd))
goto out;
folio = pmd_folio(*pmd);
+ if (folio_is_zone_device(folio))
+ goto out;
if (vma->vm_flags & VM_LOCKED)
mlock_folio(folio);
else
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2630cc30147e..84f14fa12d0d 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -30,12 +30,28 @@
#include <linux/crash_dump.h>
#include <linux/execmem.h>
#include <linux/vmstat.h>
+#include <linux/hugetlb.h>
#include "internal.h"
#include "slab.h"
#include "shuffle.h"
#include <asm/setup.h>
+#ifndef CONFIG_NUMA
+unsigned long max_mapnr;
+EXPORT_SYMBOL(max_mapnr);
+
+struct page *mem_map;
+EXPORT_SYMBOL(mem_map);
+#endif
+
+/*
+ * high_memory defines the upper bound on direct map memory, then end
+ * of ZONE_NORMAL.
+ */
+void *high_memory;
+EXPORT_SYMBOL(high_memory);
+
#ifdef CONFIG_DEBUG_MEMORY_INIT
int __meminitdata mminit_loglevel;
@@ -438,7 +454,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
* was requested by the user
*/
required_movablecore =
- roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+ round_up(required_movablecore, MAX_ORDER_NR_PAGES);
required_movablecore = min(totalpages, required_movablecore);
corepages = totalpages - required_movablecore;
@@ -545,11 +561,11 @@ restart:
out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
- for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
zone_movable_pfn[nid] =
- roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+ round_up(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
if (zone_movable_pfn[nid] >= end_pfn)
@@ -649,6 +665,28 @@ static inline void fixup_hashdist(void)
static inline void fixup_hashdist(void) {}
#endif /* CONFIG_NUMA */
+/*
+ * Initialize a reserved page unconditionally, finding its zone first.
+ */
+void __meminit __init_page_from_nid(unsigned long pfn, int nid)
+{
+ pg_data_t *pgdat;
+ int zid;
+
+ pgdat = NODE_DATA(nid);
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (zone_spans_pfn(zone, pfn))
+ break;
+ }
+ __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
+
+ if (pageblock_aligned(pfn))
+ set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE);
+}
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
{
@@ -705,26 +743,12 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static void __meminit init_reserved_page(unsigned long pfn, int nid)
+static void __meminit init_deferred_page(unsigned long pfn, int nid)
{
- pg_data_t *pgdat;
- int zid;
-
if (early_page_initialised(pfn, nid))
return;
- pgdat = NODE_DATA(nid);
-
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- struct zone *zone = &pgdat->node_zones[zid];
-
- if (zone_spans_pfn(zone, pfn))
- break;
- }
- __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
-
- if (pageblock_aligned(pfn))
- set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE);
+ __init_page_from_nid(pfn, nid);
}
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
@@ -739,7 +763,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static inline void init_reserved_page(unsigned long pfn, int nid)
+static inline void init_deferred_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
@@ -760,7 +784,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
if (pfn_valid(start_pfn)) {
struct page *page = pfn_to_page(start_pfn);
- init_reserved_page(start_pfn, nid);
+ init_deferred_page(start_pfn, nid);
/*
* no need for atomic set_bit because the struct
@@ -960,19 +984,19 @@ static void __init memmap_init(void)
}
}
-#ifdef CONFIG_SPARSEMEM
/*
* Initialize the memory map for hole in the range [memory_end,
- * section_end].
+ * section_end] for SPARSEMEM and in the range [memory_end, memmap_end]
+ * for FLATMEM.
* Append the pages in this hole to the highest zone in the last
* node.
- * The call to init_unavailable_range() is outside the ifdef to
- * silence the compiler warining about zone_id set but not used;
- * for FLATMEM it is a nop anyway
*/
+#ifdef CONFIG_SPARSEMEM
end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
- if (hole_pfn < end_pfn)
+#else
+ end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES);
#endif
+ if (hole_pfn < end_pfn)
init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
@@ -998,7 +1022,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
* ever freed or placed on a driver-private list.
*/
- page->pgmap = pgmap;
+ page_folio(page)->pgmap = pgmap;
page->zone_device_data = NULL;
/*
@@ -1017,12 +1041,25 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
}
/*
- * ZONE_DEVICE pages are released directly to the driver page allocator
- * which will set the page count to 1 when allocating the page.
+ * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released
+ * directly to the driver page allocator which will set the page count
+ * to 1 when allocating the page.
+ *
+ * MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have
+ * their refcount reset to one whenever they are freed (ie. after
+ * their refcount drops to 0).
*/
- if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
- pgmap->type == MEMORY_DEVICE_COHERENT)
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_FS_DAX:
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
+ case MEMORY_DEVICE_PCI_P2PDMA:
set_page_count(page, 0);
+ break;
+
+ case MEMORY_DEVICE_GENERIC:
+ break;
+ }
}
/*
@@ -1431,7 +1468,7 @@ void __meminit init_currently_empty_zone(struct zone *zone,
#ifndef CONFIG_SPARSEMEM
/*
- * Calculate the size of the zone->blockflags rounded to an unsigned long
+ * Calculate the size of the zone->pageblock_flags rounded to an unsigned long
* Start by making sure zonesize is a multiple of pageblock_order by rounding
* up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
* round what is now in bits to nearest long in bits, then return it in
@@ -1442,10 +1479,10 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
unsigned long usemapsize;
zonesize += zone_start_pfn & (pageblock_nr_pages-1);
- usemapsize = roundup(zonesize, pageblock_nr_pages);
+ usemapsize = round_up(zonesize, pageblock_nr_pages);
usemapsize = usemapsize >> pageblock_order;
usemapsize *= NR_PAGEBLOCK_BITS;
- usemapsize = roundup(usemapsize, BITS_PER_LONG);
+ usemapsize = round_up(usemapsize, BITS_PER_LONG);
return usemapsize / BITS_PER_BYTE;
}
@@ -1617,7 +1654,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
offset = pgdat->node_start_pfn - start;
/*
- * The zone's endpoints aren't required to be MAX_PAGE_ORDER
+ * The zone's endpoints aren't required to be MAX_PAGE_ORDER
* aligned but the node_mem_map endpoints must be in order
* for the buddy allocator to function correctly.
*/
@@ -1633,14 +1670,15 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
__func__, pgdat->node_id, (unsigned long)pgdat,
(unsigned long)pgdat->node_mem_map);
-#ifndef CONFIG_NUMA
+
/* the global mem_map is just set as node 0's */
- if (pgdat == NODE_DATA(0)) {
- mem_map = NODE_DATA(0)->node_mem_map;
- if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
- mem_map -= offset;
- }
-#endif
+ WARN_ON(pgdat != NODE_DATA(0));
+
+ mem_map = pgdat->node_mem_map;
+ if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+ mem_map -= offset;
+
+ max_mapnr = end - start;
}
#else
static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
@@ -1747,6 +1785,27 @@ static bool arch_has_descending_max_zone_pfns(void)
return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
}
+static void set_high_memory(void)
+{
+ phys_addr_t highmem = memblock_end_of_DRAM();
+
+ /*
+ * Some architectures (e.g. ARM) set high_memory very early and
+ * use it in arch setup code.
+ * If an architecture already set high_memory don't overwrite it
+ */
+ if (high_memory)
+ return;
+
+#ifdef CONFIG_HIGHMEM
+ if (arch_has_descending_max_zone_pfns() ||
+ highmem > PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]))
+ highmem = PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]);
+#endif
+
+ high_memory = phys_to_virt(highmem - 1) + 1;
+}
+
/**
* free_area_init - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
@@ -1861,11 +1920,16 @@ void __init free_area_init(unsigned long *max_zone_pfn)
}
}
+ for_each_node_state(nid, N_MEMORY)
+ sparse_vmemmap_init_nid_late(nid);
+
calc_nr_kernel_pages();
memmap_init();
/* disable hash distribution for systems with a single node */
fixup_hashdist();
+
+ set_high_memory();
}
/**
@@ -2251,6 +2315,15 @@ void __init init_cma_reserved_pageblock(struct page *page)
adjust_managed_page_count(page, pageblock_nr_pages);
page_zone(page)->cma_pages += pageblock_nr_pages;
}
+/*
+ * Similar to above, but only set the migrate type and stats.
+ */
+void __init init_cma_pageblock(struct page *page)
+{
+ set_pageblock_migratetype(page, MIGRATE_CMA);
+ adjust_managed_page_count(page, pageblock_nr_pages);
+ page_zone(page)->cma_pages += pageblock_nr_pages;
+}
#endif
void set_zone_contiguous(struct zone *zone)
@@ -2275,6 +2348,31 @@ void set_zone_contiguous(struct zone *zone)
zone->contiguous = true;
}
+/*
+ * Check if a PFN range intersects multiple zones on one or more
+ * NUMA nodes. Specify the @nid argument if it is known that this
+ * PFN range is on one node, NUMA_NO_NODE otherwise.
+ */
+bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ struct zone *zone, *izone = NULL;
+
+ for_each_zone(zone) {
+ if (nid != NUMA_NO_NODE && zone_to_nid(zone) != nid)
+ continue;
+
+ if (zone_intersects(zone, start_pfn, nr_pages)) {
+ if (izone != NULL)
+ return true;
+ izone = zone;
+ }
+
+ }
+
+ return false;
+}
+
static void __init mem_init_print_info(void);
void __init page_alloc_init_late(void)
{
@@ -2636,11 +2734,22 @@ static void __init mem_init_print_info(void)
);
}
+void __init __weak arch_mm_preinit(void)
+{
+}
+
+void __init __weak mem_init(void)
+{
+}
+
/*
* Set up kernel memory allocators
*/
void __init mm_core_init(void)
{
+ arch_mm_preinit();
+ hugetlb_bootmem_alloc();
+
/* Initializations relying on SMP setup */
BUILD_BUG_ON(MAX_ZONELISTS > 2);
build_all_zonelists(NULL);
@@ -2656,6 +2765,7 @@ void __init mm_core_init(void)
report_meminit();
kmsan_init_shadow();
stack_depot_early_init();
+ memblock_free_all();
mem_init();
kmem_cache_init();
/*
diff --git a/mm/mmap.c b/mm/mmap.c
index cda01071c7b1..bd210aaf7ebd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1305,7 +1305,8 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- remove_vma(vma, /* unreachable = */ true);
+ vma_mark_detached(vma);
+ remove_vma(vma);
count++;
cond_resched();
vma = vma_next(&vmi);
@@ -1543,6 +1544,57 @@ struct vm_area_struct *_install_special_mapping(
&special_mapping_vmops);
}
+#ifdef CONFIG_SYSCTL
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+int sysctl_legacy_va_layout;
+#endif
+
+static const struct ctl_table mmap_table[] = {
+ {
+ .procname = "max_map_count",
+ .data = &sysctl_max_map_count,
+ .maxlen = sizeof(sysctl_max_map_count),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+ {
+ .procname = "legacy_va_layout",
+ .data = &sysctl_legacy_va_layout,
+ .maxlen = sizeof(sysctl_legacy_va_layout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ {
+ .procname = "mmap_rnd_bits",
+ .data = &mmap_rnd_bits,
+ .maxlen = sizeof(mmap_rnd_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_bits_min,
+ .extra2 = (void *)&mmap_rnd_bits_max,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ {
+ .procname = "mmap_rnd_compat_bits",
+ .data = &mmap_rnd_compat_bits,
+ .maxlen = sizeof(mmap_rnd_compat_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_compat_bits_min,
+ .extra2 = (void *)&mmap_rnd_compat_bits_max,
+ },
+#endif
+};
+#endif /* CONFIG_SYSCTL */
+
/*
* initialise the percpu counter for VM
*/
@@ -1552,6 +1604,9 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("vm", mmap_table);
+#endif
}
/*
@@ -1693,6 +1748,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
struct vm_area_struct *next;
struct mmu_gather tlb;
+ PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
BUG_ON(new_start > new_end);
@@ -1707,7 +1763,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
/*
* cover the whole range: [new_start, old_end)
*/
- vmg.vma = vma;
+ vmg.middle = vma;
if (vma_expand(&vmg))
return -ENOMEM;
@@ -1715,8 +1771,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
* move the page tables downwards, on failure we rely on
* process cleanup to remove whatever mess we made.
*/
- if (length != move_page_tables(vma, old_start,
- vma, new_start, length, false, true))
+ pmc.for_stack = true;
+ if (length != move_page_tables(&pmc))
return -ENOMEM;
tlb_gather_mmu(&tlb, mm);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 7aa6f18c500b..db7ba4a725d6 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -246,8 +246,16 @@ static void __tlb_remove_table_free(struct mmu_table_batch *batch)
* IRQs delays the completion of the TLB flush we can never observe an already
* freed page.
*
- * Architectures that do not have this (PPC) need to delay the freeing by some
- * other means, this is that means.
+ * Not all systems IPI every CPU for this purpose:
+ *
+ * - Some architectures have HW support for cross-CPU synchronisation of TLB
+ * flushes, so there's no IPI at all.
+ *
+ * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate
+ * with the hypervisor to defer flushing on preempted vCPUs.
+ *
+ * Such systems need to delay the freeing by some other means, this is that
+ * means.
*
* What we do is batch the freed directory pages (tables) and RCU free them.
* We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 516b1d847e2c..62c1f7945741 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -133,7 +133,7 @@ static long change_pte_range(struct mmu_gather *tlb,
/* Also skip shared copy-on-write pages */
if (is_cow_mapping(vma->vm_flags) &&
(folio_maybe_dma_pinned(folio) ||
- folio_likely_mapped_shared(folio)))
+ folio_maybe_mapped_shared(folio)))
continue;
/*
@@ -225,14 +225,6 @@ static long change_pte_range(struct mmu_gather *tlb,
newpte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
- } else if (is_writable_device_exclusive_entry(entry)) {
- entry = make_readable_device_exclusive_entry(
- swp_offset(entry));
- newpte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(oldpte))
- newpte = pte_swp_mksoft_dirty(newpte);
- if (pte_swp_uffd_wp(oldpte))
- newpte = pte_swp_mkuffd_wp(newpte);
} else if (is_pte_marker_entry(entry)) {
/*
* Ignore error swap entries unconditionally,
@@ -607,7 +599,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
unsigned long start, unsigned long end, unsigned long newflags)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long oldflags = vma->vm_flags;
+ unsigned long oldflags = READ_ONCE(vma->vm_flags);
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned int mm_cp_flags = 0;
unsigned long charged = 0;
@@ -627,7 +619,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
* uncommon case, so doesn't need to be very optimized.
*/
if (arch_has_pfn_modify_check() &&
- (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+ (oldflags & (VM_PFNMAP|VM_MIXEDMAP)) &&
(newflags & VM_ACCESS_FLAGS) == 0) {
pgprot_t new_pgprot = vm_get_page_prot(newflags);
@@ -676,7 +668,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
* held in write mode.
*/
vma_start_write(vma);
- vm_flags_reset(vma, newflags);
+ vm_flags_reset_once(vma, newflags);
if (vma_wants_manual_pte_write_upgrade(vma))
mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
vma_set_page_prot(vma);
diff --git a/mm/mremap.c b/mm/mremap.c
index cff7f552f909..7db9da609c84 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -32,6 +32,45 @@
#include "internal.h"
+/* Classify the kind of remap operation being performed. */
+enum mremap_type {
+ MREMAP_INVALID, /* Initial state. */
+ MREMAP_NO_RESIZE, /* old_len == new_len, if not moved, do nothing. */
+ MREMAP_SHRINK, /* old_len > new_len. */
+ MREMAP_EXPAND, /* old_len < new_len. */
+};
+
+/*
+ * Describes a VMA mremap() operation and is threaded throughout it.
+ *
+ * Any of the fields may be mutated by the operation, however these values will
+ * always accurately reflect the remap (for instance, we may adjust lengths and
+ * delta to account for hugetlb alignment).
+ */
+struct vma_remap_struct {
+ /* User-provided state. */
+ unsigned long addr; /* User-specified address from which we remap. */
+ unsigned long old_len; /* Length of range being remapped. */
+ unsigned long new_len; /* Desired new length of mapping. */
+ unsigned long flags; /* user-specified MREMAP_* flags. */
+ unsigned long new_addr; /* Optionally, desired new address. */
+
+ /* uffd state. */
+ struct vm_userfaultfd_ctx *uf;
+ struct list_head *uf_unmap_early;
+ struct list_head *uf_unmap;
+
+ /* VMA state, determined in do_mremap(). */
+ struct vm_area_struct *vma;
+
+ /* Internal state, determined in do_mremap(). */
+ unsigned long delta; /* Absolute delta of old_len,new_len. */
+ bool mlocked; /* Was the VMA mlock()'d? */
+ enum mremap_type remap_type; /* expand, shrink, etc. */
+ bool mmap_locked; /* Is mm currently write-locked? */
+ unsigned long charged; /* If VM_ACCOUNT, # pages to account. */
+};
+
static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
@@ -69,8 +108,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return pmd;
}
-static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr)
+static pud_t *alloc_new_pud(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -83,13 +121,12 @@ static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
return pud_alloc(mm, p4d, addr);
}
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
{
pud_t *pud;
pmd_t *pmd;
- pud = alloc_new_pud(mm, vma, addr);
+ pud = alloc_new_pud(mm, addr);
if (!pud)
return NULL;
@@ -133,17 +170,19 @@ static pte_t move_soft_dirty_pte(pte_t pte)
return pte;
}
-static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
- unsigned long old_addr, unsigned long old_end,
- struct vm_area_struct *new_vma, pmd_t *new_pmd,
- unsigned long new_addr, bool need_rmap_locks)
+static int move_ptes(struct pagetable_move_control *pmc,
+ unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
{
+ struct vm_area_struct *vma = pmc->old;
bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
pmd_t dummy_pmdval;
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
+ unsigned long old_addr = pmc->old_addr;
+ unsigned long new_addr = pmc->new_addr;
+ unsigned long old_end = old_addr + extent;
unsigned long len = old_end - old_addr;
int err = 0;
@@ -165,7 +204,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* serialize access to individual ptes, but only rmap traversal
* order guarantees that we won't miss both the old and new ptes).
*/
- if (need_rmap_locks)
+ if (pmc->need_rmap_locks)
take_rmap_locks(vma);
/*
@@ -239,7 +278,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
out:
- if (need_rmap_locks)
+ if (pmc->need_rmap_locks)
drop_rmap_locks(vma);
return err;
}
@@ -254,10 +293,11 @@ static inline bool arch_supports_page_table_move(void)
#endif
#ifdef CONFIG_HAVE_MOVE_PMD
-static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
+static bool move_normal_pmd(struct pagetable_move_control *pmc,
+ pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
+ struct vm_area_struct *vma = pmc->old;
struct mm_struct *mm = vma->vm_mm;
bool res = false;
pmd_t pmd;
@@ -303,7 +343,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+ old_ptl = pmd_lock(mm, old_pmd);
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -320,7 +360,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
VM_BUG_ON(!pmd_none(*new_pmd));
pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
- flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+ flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PMD_SIZE);
out_unlock:
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
@@ -329,19 +369,19 @@ out_unlock:
return res;
}
#else
-static inline bool move_normal_pmd(struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
- pmd_t *new_pmd)
+static inline bool move_normal_pmd(struct pagetable_move_control *pmc,
+ pmd_t *old_pmd, pmd_t *new_pmd)
{
return false;
}
#endif
#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
-static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+static bool move_normal_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
spinlock_t *old_ptl, *new_ptl;
+ struct vm_area_struct *vma = pmc->old;
struct mm_struct *mm = vma->vm_mm;
pud_t pud;
@@ -367,7 +407,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pud_lock(vma->vm_mm, old_pud);
+ old_ptl = pud_lock(mm, old_pud);
new_ptl = pud_lockptr(mm, new_pud);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -379,7 +419,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
VM_BUG_ON(!pud_none(*new_pud));
pud_populate(mm, new_pud, pud_pgtable(pud));
- flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
+ flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PUD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
@@ -387,19 +427,19 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
return true;
}
#else
-static inline bool move_normal_pud(struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
- pud_t *new_pud)
+static inline bool move_normal_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
return false;
}
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
-static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+static bool move_huge_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
{
spinlock_t *old_ptl, *new_ptl;
+ struct vm_area_struct *vma = pmc->old;
struct mm_struct *mm = vma->vm_mm;
pud_t pud;
@@ -414,7 +454,7 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
*/
- old_ptl = pud_lock(vma->vm_mm, old_pud);
+ old_ptl = pud_lock(mm, old_pud);
new_ptl = pud_lockptr(mm, new_pud);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -427,8 +467,8 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
/* Set the new pud */
/* mark soft_ditry when we add pud level soft dirty support */
- set_pud_at(mm, new_addr, new_pud, pud);
- flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
+ set_pud_at(mm, pmc->new_addr, new_pud, pud);
+ flush_pud_tlb_range(vma, pmc->old_addr, pmc->old_addr + HPAGE_PUD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
@@ -436,8 +476,9 @@ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
return true;
}
#else
-static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+static bool move_huge_pud(struct pagetable_move_control *pmc,
+ pud_t *old_pud, pud_t *new_pud)
+
{
WARN_ON_ONCE(1);
return false;
@@ -458,10 +499,12 @@ enum pgt_entry {
* destination pgt_entry.
*/
static __always_inline unsigned long get_extent(enum pgt_entry entry,
- unsigned long old_addr, unsigned long old_end,
- unsigned long new_addr)
+ struct pagetable_move_control *pmc)
{
unsigned long next, extent, mask, size;
+ unsigned long old_addr = pmc->old_addr;
+ unsigned long old_end = pmc->old_end;
+ unsigned long new_addr = pmc->new_addr;
switch (entry) {
case HPAGE_PMD:
@@ -491,37 +534,50 @@ static __always_inline unsigned long get_extent(enum pgt_entry entry,
}
/*
+ * Should move_pgt_entry() acquire the rmap locks? This is either expressed in
+ * the PMC, or overridden in the case of normal, larger page tables.
+ */
+static bool should_take_rmap_locks(struct pagetable_move_control *pmc,
+ enum pgt_entry entry)
+{
+ switch (entry) {
+ case NORMAL_PMD:
+ case NORMAL_PUD:
+ return true;
+ default:
+ return pmc->need_rmap_locks;
+ }
+}
+
+/*
* Attempts to speedup the move by moving entry at the level corresponding to
* pgt_entry. Returns true if the move was successful, else false.
*/
-static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long new_addr,
- void *old_entry, void *new_entry, bool need_rmap_locks)
+static bool move_pgt_entry(struct pagetable_move_control *pmc,
+ enum pgt_entry entry, void *old_entry, void *new_entry)
{
bool moved = false;
+ bool need_rmap_locks = should_take_rmap_locks(pmc, entry);
/* See comment in move_ptes() */
if (need_rmap_locks)
- take_rmap_locks(vma);
+ take_rmap_locks(pmc->old);
switch (entry) {
case NORMAL_PMD:
- moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
- new_entry);
+ moved = move_normal_pmd(pmc, old_entry, new_entry);
break;
case NORMAL_PUD:
- moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
- new_entry);
+ moved = move_normal_pud(pmc, old_entry, new_entry);
break;
case HPAGE_PMD:
moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- move_huge_pmd(vma, old_addr, new_addr, old_entry,
+ move_huge_pmd(pmc->old, pmc->old_addr, pmc->new_addr, old_entry,
new_entry);
break;
case HPAGE_PUD:
moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- move_huge_pud(vma, old_addr, new_addr, old_entry,
- new_entry);
+ move_huge_pud(pmc, old_entry, new_entry);
break;
default:
@@ -530,7 +586,7 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
}
if (need_rmap_locks)
- drop_rmap_locks(vma);
+ drop_rmap_locks(pmc->old);
return moved;
}
@@ -541,8 +597,9 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
* the VMA that is created to span the source and destination of the move,
* so we make an exception for it.
*/
-static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
- unsigned long mask, bool for_stack)
+static bool can_align_down(struct pagetable_move_control *pmc,
+ struct vm_area_struct *vma, unsigned long addr_to_align,
+ unsigned long mask)
{
unsigned long addr_masked = addr_to_align & mask;
@@ -551,11 +608,11 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali
* of the corresponding VMA, we can't align down or we will destroy part
* of the current mapping.
*/
- if (!for_stack && vma->vm_start != addr_to_align)
+ if (!pmc->for_stack && vma->vm_start != addr_to_align)
return false;
/* In the stack case we explicitly permit in-VMA alignment. */
- if (for_stack && addr_masked >= vma->vm_start)
+ if (pmc->for_stack && addr_masked >= vma->vm_start)
return true;
/*
@@ -565,163 +622,390 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali
return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
}
-/* Opportunistically realign to specified boundary for faster copy. */
-static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
- unsigned long *new_addr, struct vm_area_struct *new_vma,
- unsigned long mask, bool for_stack)
+/*
+ * Determine if are in fact able to realign for efficiency to a higher page
+ * table boundary.
+ */
+static bool can_realign_addr(struct pagetable_move_control *pmc,
+ unsigned long pagetable_mask)
{
+ unsigned long align_mask = ~pagetable_mask;
+ unsigned long old_align = pmc->old_addr & align_mask;
+ unsigned long new_align = pmc->new_addr & align_mask;
+ unsigned long pagetable_size = align_mask + 1;
+ unsigned long old_align_next = pagetable_size - old_align;
+
+ /*
+ * We don't want to have to go hunting for VMAs from the end of the old
+ * VMA to the next page table boundary, also we want to make sure the
+ * operation is wortwhile.
+ *
+ * So ensure that we only perform this realignment if the end of the
+ * range being copied reaches or crosses the page table boundary.
+ *
+ * boundary boundary
+ * .<- old_align -> .
+ * . |----------------.-----------|
+ * . | vma . |
+ * . |----------------.-----------|
+ * . <----------------.----------->
+ * . len_in
+ * <------------------------------->
+ * . pagetable_size .
+ * . <---------------->
+ * . old_align_next .
+ */
+ if (pmc->len_in < old_align_next)
+ return false;
+
/* Skip if the addresses are already aligned. */
- if ((*old_addr & ~mask) == 0)
- return;
+ if (old_align == 0)
+ return false;
/* Only realign if the new and old addresses are mutually aligned. */
- if ((*old_addr & ~mask) != (*new_addr & ~mask))
- return;
+ if (old_align != new_align)
+ return false;
/* Ensure realignment doesn't cause overlap with existing mappings. */
- if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
- !can_align_down(new_vma, *new_addr, mask, for_stack))
+ if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) ||
+ !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask))
+ return false;
+
+ return true;
+}
+
+/*
+ * Opportunistically realign to specified boundary for faster copy.
+ *
+ * Consider an mremap() of a VMA with page table boundaries as below, and no
+ * preceding VMAs from the lower page table boundary to the start of the VMA,
+ * with the end of the range reaching or crossing the page table boundary.
+ *
+ * boundary boundary
+ * . |----------------.-----------|
+ * . | vma . |
+ * . |----------------.-----------|
+ * . pmc->old_addr . pmc->old_end
+ * . <---------------------------->
+ * . move these page tables
+ *
+ * If we proceed with moving page tables in this scenario, we will have a lot of
+ * work to do traversing old page tables and establishing new ones in the
+ * destination across multiple lower level page tables.
+ *
+ * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the
+ * page table boundary, so we can simply copy a single page table entry for the
+ * aligned portion of the VMA instead:
+ *
+ * boundary boundary
+ * . |----------------.-----------|
+ * . | vma . |
+ * . |----------------.-----------|
+ * pmc->old_addr . pmc->old_end
+ * <------------------------------------------->
+ * . move these page tables
+ */
+static void try_realign_addr(struct pagetable_move_control *pmc,
+ unsigned long pagetable_mask)
+{
+
+ if (!can_realign_addr(pmc, pagetable_mask))
return;
- *old_addr = *old_addr & mask;
- *new_addr = *new_addr & mask;
+ /*
+ * Simply align to page table boundaries. Note that we do NOT update the
+ * pmc->old_end value, and since the move_page_tables() operation spans
+ * from [old_addr, old_end) (offsetting new_addr as it is performed),
+ * this simply changes the start of the copy, not the end.
+ */
+ pmc->old_addr &= pagetable_mask;
+ pmc->new_addr &= pagetable_mask;
+}
+
+/* Is the page table move operation done? */
+static bool pmc_done(struct pagetable_move_control *pmc)
+{
+ return pmc->old_addr >= pmc->old_end;
+}
+
+/* Advance to the next page table, offset by extent bytes. */
+static void pmc_next(struct pagetable_move_control *pmc, unsigned long extent)
+{
+ pmc->old_addr += extent;
+ pmc->new_addr += extent;
}
-unsigned long move_page_tables(struct vm_area_struct *vma,
- unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len,
- bool need_rmap_locks, bool for_stack)
+/*
+ * Determine how many bytes in the specified input range have had their page
+ * tables moved so far.
+ */
+static unsigned long pmc_progress(struct pagetable_move_control *pmc)
{
- unsigned long extent, old_end;
+ unsigned long orig_old_addr = pmc->old_end - pmc->len_in;
+ unsigned long old_addr = pmc->old_addr;
+
+ /*
+ * Prevent negative return values when {old,new}_addr was realigned but
+ * we broke out of the loop in move_page_tables() for the first PMD
+ * itself.
+ */
+ return old_addr < orig_old_addr ? 0 : old_addr - orig_old_addr;
+}
+
+unsigned long move_page_tables(struct pagetable_move_control *pmc)
+{
+ unsigned long extent;
struct mmu_notifier_range range;
pmd_t *old_pmd, *new_pmd;
pud_t *old_pud, *new_pud;
+ struct mm_struct *mm = pmc->old->vm_mm;
- if (!len)
+ if (!pmc->len_in)
return 0;
- old_end = old_addr + len;
-
- if (is_vm_hugetlb_page(vma))
- return move_hugetlb_page_tables(vma, new_vma, old_addr,
- new_addr, len);
+ if (is_vm_hugetlb_page(pmc->old))
+ return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr,
+ pmc->new_addr, pmc->len_in);
/*
* If possible, realign addresses to PMD boundary for faster copy.
* Only realign if the mremap copying hits a PMD boundary.
*/
- if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
- try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
- for_stack);
+ try_realign_addr(pmc, PMD_MASK);
- flush_cache_range(vma, old_addr, old_end);
- mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
- old_addr, old_end);
+ flush_cache_range(pmc->old, pmc->old_addr, pmc->old_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, mm,
+ pmc->old_addr, pmc->old_end);
mmu_notifier_invalidate_range_start(&range);
- for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
+ for (; !pmc_done(pmc); pmc_next(pmc, extent)) {
cond_resched();
/*
* If extent is PUD-sized try to speed up the move by moving at the
* PUD level if possible.
*/
- extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
+ extent = get_extent(NORMAL_PUD, pmc);
- old_pud = get_old_pud(vma->vm_mm, old_addr);
+ old_pud = get_old_pud(mm, pmc->old_addr);
if (!old_pud)
continue;
- new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+ new_pud = alloc_new_pud(mm, pmc->new_addr);
if (!new_pud)
break;
if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
if (extent == HPAGE_PUD_SIZE) {
- move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
- old_pud, new_pud, need_rmap_locks);
+ move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud);
/* We ignore and continue on error? */
continue;
}
} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
-
- if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
- old_pud, new_pud, true))
+ if (move_pgt_entry(pmc, NORMAL_PUD, old_pud, new_pud))
continue;
}
- extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
- old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+ extent = get_extent(NORMAL_PMD, pmc);
+ old_pmd = get_old_pmd(mm, pmc->old_addr);
if (!old_pmd)
continue;
- new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
+ new_pmd = alloc_new_pmd(mm, pmc->new_addr);
if (!new_pmd)
break;
again:
if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
pmd_devmap(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE &&
- move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
- old_pmd, new_pmd, need_rmap_locks))
+ move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
continue;
- split_huge_pmd(vma, old_pmd, old_addr);
+ split_huge_pmd(pmc->old, old_pmd, pmc->old_addr);
} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
extent == PMD_SIZE) {
/*
* If the extent is PMD-sized, try to speed the move by
* moving at the PMD level if possible.
*/
- if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
- old_pmd, new_pmd, true))
+ if (move_pgt_entry(pmc, NORMAL_PMD, old_pmd, new_pmd))
continue;
}
if (pmd_none(*old_pmd))
continue;
- if (pte_alloc(new_vma->vm_mm, new_pmd))
+ if (pte_alloc(pmc->new->vm_mm, new_pmd))
break;
- if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
- new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
+ if (move_ptes(pmc, extent, old_pmd, new_pmd) < 0)
goto again;
}
mmu_notifier_invalidate_range_end(&range);
+ return pmc_progress(pmc);
+}
+
+/* Set vrm->delta to the difference in VMA size specified by user. */
+static void vrm_set_delta(struct vma_remap_struct *vrm)
+{
+ vrm->delta = abs_diff(vrm->old_len, vrm->new_len);
+}
+
+/* Determine what kind of remap this is - shrink, expand or no resize at all. */
+static enum mremap_type vrm_remap_type(struct vma_remap_struct *vrm)
+{
+ if (vrm->delta == 0)
+ return MREMAP_NO_RESIZE;
+
+ if (vrm->old_len > vrm->new_len)
+ return MREMAP_SHRINK;
+
+ return MREMAP_EXPAND;
+}
+
+/*
+ * When moving a VMA to vrm->new_adr, does this result in the new and old VMAs
+ * overlapping?
+ */
+static bool vrm_overlaps(struct vma_remap_struct *vrm)
+{
+ unsigned long start_old = vrm->addr;
+ unsigned long start_new = vrm->new_addr;
+ unsigned long end_old = vrm->addr + vrm->old_len;
+ unsigned long end_new = vrm->new_addr + vrm->new_len;
+
/*
- * Prevent negative return values when {old,new}_addr was realigned
- * but we broke out of the above loop for the first PMD itself.
+ * start_old end_old
+ * |-----------|
+ * | |
+ * |-----------|
+ * |-------------|
+ * | |
+ * |-------------|
+ * start_new end_new
*/
- if (old_addr < old_end - len)
- return 0;
+ if (end_old > start_new && end_new > start_old)
+ return true;
- return len + old_addr - old_end; /* how much done */
+ return false;
}
-static unsigned long move_vma(struct vm_area_struct *vma,
- unsigned long old_addr, unsigned long old_len,
- unsigned long new_len, unsigned long new_addr,
- bool *locked, unsigned long flags,
- struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
+/* Do the mremap() flags require that the new_addr parameter be specified? */
+static bool vrm_implies_new_addr(struct vma_remap_struct *vrm)
{
- long to_account = new_len - old_len;
- struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *new_vma;
- unsigned long vm_flags = vma->vm_flags;
- unsigned long new_pgoff;
- unsigned long moved_len;
- unsigned long account_start = 0;
- unsigned long account_end = 0;
- unsigned long hiwater_vm;
- int err = 0;
- bool need_rmap_locks;
- struct vma_iterator vmi;
+ return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP);
+}
+
+/*
+ * Find an unmapped area for the requested vrm->new_addr.
+ *
+ * If MREMAP_FIXED then this is equivalent to a MAP_FIXED mmap() call. If only
+ * MREMAP_DONTUNMAP is set, then this is equivalent to providing a hint to
+ * mmap(), otherwise this is equivalent to mmap() specifying a NULL address.
+ *
+ * Returns 0 on success (with vrm->new_addr updated), or an error code upon
+ * failure.
+ */
+static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm)
+{
+ struct vm_area_struct *vma = vrm->vma;
+ unsigned long map_flags = 0;
+ /* Page Offset _into_ the VMA. */
+ pgoff_t internal_pgoff = (vrm->addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff_t pgoff = vma->vm_pgoff + internal_pgoff;
+ unsigned long new_addr = vrm_implies_new_addr(vrm) ? vrm->new_addr : 0;
+ unsigned long res;
+
+ if (vrm->flags & MREMAP_FIXED)
+ map_flags |= MAP_FIXED;
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+ res = get_unmapped_area(vma->vm_file, new_addr, vrm->new_len, pgoff,
+ map_flags);
+ if (IS_ERR_VALUE(res))
+ return res;
+
+ vrm->new_addr = res;
+ return 0;
+}
+
+/*
+ * Keep track of pages which have been added to the memory mapping. If the VMA
+ * is accounted, also check to see if there is sufficient memory.
+ *
+ * Returns true on success, false if insufficient memory to charge.
+ */
+static bool vrm_charge(struct vma_remap_struct *vrm)
+{
+ unsigned long charged;
+
+ if (!(vrm->vma->vm_flags & VM_ACCOUNT))
+ return true;
+
+ /*
+ * If we don't unmap the old mapping, then we account the entirety of
+ * the length of the new one. Otherwise it's just the delta in size.
+ */
+ if (vrm->flags & MREMAP_DONTUNMAP)
+ charged = vrm->new_len >> PAGE_SHIFT;
+ else
+ charged = vrm->delta >> PAGE_SHIFT;
+
+
+ /* This accounts 'charged' pages of memory. */
+ if (security_vm_enough_memory_mm(current->mm, charged))
+ return false;
+
+ vrm->charged = charged;
+ return true;
+}
+
+/*
+ * an error has occurred so we will not be using vrm->charged memory. Unaccount
+ * this memory if the VMA is accounted.
+ */
+static void vrm_uncharge(struct vma_remap_struct *vrm)
+{
+ if (!(vrm->vma->vm_flags & VM_ACCOUNT))
+ return;
+
+ vm_unacct_memory(vrm->charged);
+ vrm->charged = 0;
+}
+
+/*
+ * Update mm exec_vm, stack_vm, data_vm, and locked_vm fields as needed to
+ * account for 'bytes' memory used, and if locked, indicate this in the VRM so
+ * we can handle this correctly later.
+ */
+static void vrm_stat_account(struct vma_remap_struct *vrm,
+ unsigned long bytes)
+{
+ unsigned long pages = bytes >> PAGE_SHIFT;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = vrm->vma;
+
+ vm_stat_account(mm, vma->vm_flags, pages);
+ if (vma->vm_flags & VM_LOCKED) {
+ mm->locked_vm += pages;
+ vrm->mlocked = true;
+ }
+}
+
+/*
+ * Perform checks before attempting to write a VMA prior to it being
+ * moved.
+ */
+static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
+{
+ unsigned long err = 0;
+ struct vm_area_struct *vma = vrm->vma;
+ unsigned long old_addr = vrm->addr;
+ unsigned long old_len = vrm->old_len;
+ unsigned long dummy = vma->vm_flags;
/*
* We'd prefer to avoid failure later on in do_munmap:
* which may split one vma into three before unmapping.
*/
- if (mm->map_count >= sysctl_max_map_count - 3)
+ if (current->mm->map_count >= sysctl_max_map_count - 3)
return -ENOMEM;
- if (unlikely(flags & MREMAP_DONTUNMAP))
- to_account = new_len;
-
if (vma->vm_ops && vma->vm_ops->may_split) {
if (vma->vm_start != old_addr)
err = vma->vm_ops->may_split(vma, old_addr);
@@ -739,61 +1023,239 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* so KSM can come around to merge on vma and new_vma afterwards.
*/
err = ksm_madvise(vma, old_addr, old_addr + old_len,
- MADV_UNMERGEABLE, &vm_flags);
+ MADV_UNMERGEABLE, &dummy);
if (err)
return err;
- if (vm_flags & VM_ACCOUNT) {
- if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
- return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Unmap source VMA for VMA move, turning it from a copy to a move, being
+ * careful to ensure we do not underflow memory account while doing so if an
+ * accountable move.
+ *
+ * This is best effort, if we fail to unmap then we simply try to correct
+ * accounting and exit.
+ */
+static void unmap_source_vma(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr = vrm->addr;
+ unsigned long len = vrm->old_len;
+ struct vm_area_struct *vma = vrm->vma;
+ VMA_ITERATOR(vmi, mm, addr);
+ int err;
+ unsigned long vm_start;
+ unsigned long vm_end;
+ /*
+ * It might seem odd that we check for MREMAP_DONTUNMAP here, given this
+ * function implies that we unmap the original VMA, which seems
+ * contradictory.
+ *
+ * However, this occurs when this operation was attempted and an error
+ * arose, in which case we _do_ wish to unmap the _new_ VMA, which means
+ * we actually _do_ want it be unaccounted.
+ */
+ bool accountable_move = (vma->vm_flags & VM_ACCOUNT) &&
+ !(vrm->flags & MREMAP_DONTUNMAP);
+
+ /*
+ * So we perform a trick here to prevent incorrect accounting. Any merge
+ * or new VMA allocation performed in copy_vma() does not adjust
+ * accounting, it is expected that callers handle this.
+ *
+ * And indeed we already have, accounting appropriately in the case of
+ * both in vrm_charge().
+ *
+ * However, when we unmap the existing VMA (to effect the move), this
+ * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount
+ * removed pages.
+ *
+ * To avoid this we temporarily clear this flag, reinstating on any
+ * portions of the original VMA that remain.
+ */
+ if (accountable_move) {
+ vm_flags_clear(vma, VM_ACCOUNT);
+ /* We are about to split vma, so store the start/end. */
+ vm_start = vma->vm_start;
+ vm_end = vma->vm_end;
+ }
+
+ err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false);
+ vrm->vma = NULL; /* Invalidated. */
+ if (err) {
+ /* OOM: unable to split vma, just get accounts right */
+ vm_acct_memory(len >> PAGE_SHIFT);
+ return;
+ }
+
+ /*
+ * If we mremap() from a VMA like this:
+ *
+ * addr end
+ * | |
+ * v v
+ * |-------------|
+ * | |
+ * |-------------|
+ *
+ * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above
+ * we'll end up with:
+ *
+ * addr end
+ * | |
+ * v v
+ * |---| |---|
+ * | A | | B |
+ * |---| |---|
+ *
+ * The VMI is still pointing at addr, so vma_prev() will give us A, and
+ * a subsequent or lone vma_next() will give as B.
+ *
+ * do_vmi_munmap() will have restored the VMI back to addr.
+ */
+ if (accountable_move) {
+ unsigned long end = addr + len;
+
+ if (vm_start < addr) {
+ struct vm_area_struct *prev = vma_prev(&vmi);
+
+ vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */
+ }
+
+ if (vm_end > end) {
+ struct vm_area_struct *next = vma_next(&vmi);
+
+ vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */
+ }
}
+}
+
+/*
+ * Copy vrm->vma over to vrm->new_addr possibly adjusting size as part of the
+ * process. Additionally handle an error occurring on moving of page tables,
+ * where we reset vrm state to cause unmapping of the new VMA.
+ *
+ * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an
+ * error code.
+ */
+static int copy_vma_and_data(struct vma_remap_struct *vrm,
+ struct vm_area_struct **new_vma_ptr)
+{
+ unsigned long internal_offset = vrm->addr - vrm->vma->vm_start;
+ unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
+ unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
+ unsigned long moved_len;
+ struct vm_area_struct *vma = vrm->vma;
+ struct vm_area_struct *new_vma;
+ int err = 0;
+ PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len);
- vma_start_write(vma);
- new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
- new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
- &need_rmap_locks);
+ new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff,
+ &pmc.need_rmap_locks);
if (!new_vma) {
- if (vm_flags & VM_ACCOUNT)
- vm_unacct_memory(to_account >> PAGE_SHIFT);
+ vrm_uncharge(vrm);
+ *new_vma_ptr = NULL;
return -ENOMEM;
}
+ vrm->vma = vma;
+ pmc.old = vma;
+ pmc.new = new_vma;
- moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
- need_rmap_locks, false);
- if (moved_len < old_len) {
+ moved_len = move_page_tables(&pmc);
+ if (moved_len < vrm->old_len)
err = -ENOMEM;
- } else if (vma->vm_ops && vma->vm_ops->mremap) {
+ else if (vma->vm_ops && vma->vm_ops->mremap)
err = vma->vm_ops->mremap(new_vma);
- }
if (unlikely(err)) {
+ PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr,
+ vrm->addr, moved_len);
+
/*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
* and then proceed to unmap new area instead of old.
*/
- move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
- true, false);
- vma = new_vma;
- old_len = new_len;
- old_addr = new_addr;
- new_addr = err;
+ pmc_revert.need_rmap_locks = true;
+ move_page_tables(&pmc_revert);
+
+ vrm->vma = new_vma;
+ vrm->old_len = vrm->new_len;
+ vrm->addr = vrm->new_addr;
} else {
- mremap_userfaultfd_prep(new_vma, uf);
+ mremap_userfaultfd_prep(new_vma, vrm->uf);
}
- if (is_vm_hugetlb_page(vma)) {
+ if (is_vm_hugetlb_page(vma))
clear_vma_resv_huge_pages(vma);
- }
- /* Conceal VM_ACCOUNT so old reservation is not undone */
- if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
- vm_flags_clear(vma, VM_ACCOUNT);
- if (vma->vm_start < old_addr)
- account_start = vma->vm_start;
- if (vma->vm_end > old_addr + old_len)
- account_end = vma->vm_end;
- }
+ /* Tell pfnmap has moved from this vma */
+ if (unlikely(vma->vm_flags & VM_PFNMAP))
+ untrack_pfn_clear(vma);
+
+ *new_vma_ptr = new_vma;
+ return err;
+}
+
+/*
+ * Perform final tasks for MADV_DONTUNMAP operation, clearing mlock() and
+ * account flags on remaining VMA by convention (it cannot be mlock()'d any
+ * longer, as pages in range are no longer mapped), and removing anon_vma_chain
+ * links from it (if the entire VMA was copied over).
+ */
+static void dontunmap_complete(struct vma_remap_struct *vrm,
+ struct vm_area_struct *new_vma)
+{
+ unsigned long start = vrm->addr;
+ unsigned long end = vrm->addr + vrm->old_len;
+ unsigned long old_start = vrm->vma->vm_start;
+ unsigned long old_end = vrm->vma->vm_end;
+
+ /*
+ * We always clear VM_LOCKED[ONFAULT] | VM_ACCOUNT on the old
+ * vma.
+ */
+ vm_flags_clear(vrm->vma, VM_LOCKED_MASK | VM_ACCOUNT);
+
+ /*
+ * anon_vma links of the old vma is no longer needed after its page
+ * table has been moved.
+ */
+ if (new_vma != vrm->vma && start == old_start && end == old_end)
+ unlink_anon_vmas(vrm->vma);
+
+ /* Because we won't unmap we don't need to touch locked_vm. */
+}
+
+static unsigned long move_vma(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *new_vma;
+ unsigned long hiwater_vm;
+ int err;
+
+ err = prep_move_vma(vrm);
+ if (err)
+ return err;
+
+ /* If accounted, charge the number of bytes the operation will use. */
+ if (!vrm_charge(vrm))
+ return -ENOMEM;
+
+ /* We don't want racing faults. */
+ vma_start_write(vrm->vma);
+
+ /* Perform copy step. */
+ err = copy_vma_and_data(vrm, &new_vma);
+ /*
+ * If we established the copied-to VMA, we attempt to recover from the
+ * error by setting the destination VMA to the source VMA and unmapping
+ * it below.
+ */
+ if (err && !new_vma)
+ return err;
/*
* If we failed to move page tables we still do total_vm increment
@@ -805,73 +1267,31 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* If this were a serious issue, we'd add a flag to do_munmap().
*/
hiwater_vm = mm->hiwater_vm;
- vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
-
- /* Tell pfnmap has moved from this vma */
- if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn_clear(vma);
-
- if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
- /* We always clear VM_LOCKED[ONFAULT] on the old vma */
- vm_flags_clear(vma, VM_LOCKED_MASK);
-
- /*
- * anon_vma links of the old vma is no longer needed after its page
- * table has been moved.
- */
- if (new_vma != vma && vma->vm_start == old_addr &&
- vma->vm_end == (old_addr + old_len))
- unlink_anon_vmas(vma);
- /* Because we won't unmap we don't need to touch locked_vm */
- return new_addr;
- }
-
- vma_iter_init(&vmi, mm, old_addr);
- if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
- /* OOM: unable to split vma, just get accounts right */
- if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
- vm_acct_memory(old_len >> PAGE_SHIFT);
- account_start = account_end = 0;
- }
-
- if (vm_flags & VM_LOCKED) {
- mm->locked_vm += new_len >> PAGE_SHIFT;
- *locked = true;
- }
+ vrm_stat_account(vrm, vrm->new_len);
+ if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP)))
+ dontunmap_complete(vrm, new_vma);
+ else
+ unmap_source_vma(vrm);
mm->hiwater_vm = hiwater_vm;
- /* Restore VM_ACCOUNT if one or two pieces of vma left */
- if (account_start) {
- vma = vma_prev(&vmi);
- vm_flags_set(vma, VM_ACCOUNT);
- }
-
- if (account_end) {
- vma = vma_next(&vmi);
- vm_flags_set(vma, VM_ACCOUNT);
- }
-
- return new_addr;
+ return err ? (unsigned long)err : vrm->new_addr;
}
/*
* resize_is_valid() - Ensure the vma can be resized to the new length at the give
* address.
*
- * @vma: The vma to resize
- * @addr: The old address
- * @old_len: The current size
- * @new_len: The desired size
- * @flags: The vma flags
- *
* Return 0 on success, error otherwise.
*/
-static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr,
- unsigned long old_len, unsigned long new_len, unsigned long flags)
+static int resize_is_valid(struct vma_remap_struct *vrm)
{
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = vrm->vma;
+ unsigned long addr = vrm->addr;
+ unsigned long old_len = vrm->old_len;
+ unsigned long new_len = vrm->new_len;
unsigned long pgoff;
/*
@@ -883,11 +1303,12 @@ static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr,
* behavior. As a result, fail such attempts.
*/
if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
- pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
+ pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n",
+ current->comm, current->pid);
return -EINVAL;
}
- if ((flags & MREMAP_DONTUNMAP) &&
+ if ((vrm->flags & MREMAP_DONTUNMAP) &&
(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
return -EINVAL;
@@ -907,118 +1328,120 @@ static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr,
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
return -EFAULT;
- if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
+ if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta))
return -EAGAIN;
- if (!may_expand_vm(mm, vma->vm_flags,
- (new_len - old_len) >> PAGE_SHIFT))
+ if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
return -ENOMEM;
return 0;
}
/*
- * mremap_to() - remap a vma to a new location
- * @addr: The old address
- * @old_len: The old size
- * @new_addr: The target address
- * @new_len: The new size
- * @locked: If the returned vma is locked (VM_LOCKED)
- * @flags: the mremap flags
- * @uf: The mremap userfaultfd context
- * @uf_unmap_early: The userfaultfd unmap early context
- * @uf_unmap: The userfaultfd unmap context
+ * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so
+ * execute this, optionally dropping the mmap lock when we do so.
*
- * Returns: The new address of the vma or an error.
+ * In both cases this invalidates the VMA, however if we don't drop the lock,
+ * then load the correct VMA into vrm->vma afterwards.
*/
-static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
- unsigned long new_addr, unsigned long new_len, bool *locked,
- unsigned long flags, struct vm_userfaultfd_ctx *uf,
- struct list_head *uf_unmap_early,
- struct list_head *uf_unmap)
+static unsigned long shrink_vma(struct vma_remap_struct *vrm,
+ bool drop_lock)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long ret;
- unsigned long map_flags = 0;
+ unsigned long unmap_start = vrm->addr + vrm->new_len;
+ unsigned long unmap_bytes = vrm->delta;
+ unsigned long res;
+ VMA_ITERATOR(vmi, mm, unmap_start);
- if (offset_in_page(new_addr))
- return -EINVAL;
+ VM_BUG_ON(vrm->remap_type != MREMAP_SHRINK);
- if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
- return -EINVAL;
-
- /* Ensure the old/new locations do not overlap */
- if (addr + old_len > new_addr && new_addr + new_len > addr)
- return -EINVAL;
+ res = do_vmi_munmap(&vmi, mm, unmap_start, unmap_bytes,
+ vrm->uf_unmap, drop_lock);
+ vrm->vma = NULL; /* Invalidated. */
+ if (res)
+ return res;
/*
- * move_vma() need us to stay 4 maps below the threshold, otherwise
- * it will bail out at the very beginning.
- * That is a problem if we have already unmaped the regions here
- * (new_addr, and old_addr), because userspace will not know the
- * state of the vma's after it gets -ENOMEM.
- * So, to avoid such scenario we can pre-compute if the whole
- * operation has high chances to success map-wise.
- * Worst-scenario case is when both vma's (new_addr and old_addr) get
- * split in 3 before unmapping it.
- * That means 2 more maps (1 for each) to the ones we already hold.
- * Check whether current map count plus 2 still leads us to 4 maps below
- * the threshold, otherwise return -ENOMEM here to be more safe.
+ * If we've not dropped the lock, then we should reload the VMA to
+ * replace the invalidated VMA with the one that may have now been
+ * split.
*/
- if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
- return -ENOMEM;
+ if (drop_lock) {
+ vrm->mmap_locked = false;
+ } else {
+ vrm->vma = vma_lookup(mm, vrm->addr);
+ if (!vrm->vma)
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * mremap_to() - remap a vma to a new location.
+ * Returns: The new address of the vma or an error.
+ */
+static unsigned long mremap_to(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long err;
+
+ /* Is the new length or address silly? */
+ if (vrm->new_len > TASK_SIZE ||
+ vrm->new_addr > TASK_SIZE - vrm->new_len)
+ return -EINVAL;
- if (flags & MREMAP_FIXED) {
+ if (vrm_overlaps(vrm))
+ return -EINVAL;
+
+ if (vrm->flags & MREMAP_FIXED) {
/*
* In mremap_to().
* VMA is moved to dst address, and munmap dst first.
* do_munmap will check if dst is sealed.
*/
- ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
- if (ret)
- return ret;
- }
+ err = do_munmap(mm, vrm->new_addr, vrm->new_len,
+ vrm->uf_unmap_early);
+ vrm->vma = NULL; /* Invalidated. */
+ if (err)
+ return err;
- if (old_len > new_len) {
- ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
- if (ret)
- return ret;
- old_len = new_len;
+ /*
+ * If we remap a portion of a VMA elsewhere in the same VMA,
+ * this can invalidate the old VMA. Reset.
+ */
+ vrm->vma = vma_lookup(mm, vrm->addr);
+ if (!vrm->vma)
+ return -EFAULT;
}
- vma = vma_lookup(mm, addr);
- if (!vma)
- return -EFAULT;
-
- ret = resize_is_valid(vma, addr, old_len, new_len, flags);
- if (ret)
- return ret;
+ if (vrm->remap_type == MREMAP_SHRINK) {
+ err = shrink_vma(vrm, /* drop_lock= */false);
+ if (err)
+ return err;
- /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
- if (flags & MREMAP_DONTUNMAP &&
- !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
- return -ENOMEM;
+ /* Set up for the move now shrink has been executed. */
+ vrm->old_len = vrm->new_len;
}
- if (flags & MREMAP_FIXED)
- map_flags |= MAP_FIXED;
+ err = resize_is_valid(vrm);
+ if (err)
+ return err;
- if (vma->vm_flags & VM_MAYSHARE)
- map_flags |= MAP_SHARED;
+ /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
+ if (vrm->flags & MREMAP_DONTUNMAP) {
+ vm_flags_t vm_flags = vrm->vma->vm_flags;
+ unsigned long pages = vrm->old_len >> PAGE_SHIFT;
- ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
- ((addr - vma->vm_start) >> PAGE_SHIFT),
- map_flags);
- if (IS_ERR_VALUE(ret))
- return ret;
+ if (!may_expand_vm(mm, vm_flags, pages))
+ return -ENOMEM;
+ }
- /* We got a new mapping */
- if (!(flags & MREMAP_FIXED))
- new_addr = ret;
+ err = vrm_set_new_addr(vrm);
+ if (err)
+ return err;
- return move_vma(vma, addr, old_len, new_len, new_addr, locked, flags,
- uf, uf_unmap);
+ return move_vma(vrm);
}
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
@@ -1035,215 +1458,329 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
return 1;
}
+/* Determine whether we are actually able to execute an in-place expansion. */
+static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm)
+{
+ /* Number of bytes from vrm->addr to end of VMA. */
+ unsigned long suffix_bytes = vrm->vma->vm_end - vrm->addr;
+
+ /* If end of range aligns to end of VMA, we can just expand in-place. */
+ if (suffix_bytes != vrm->old_len)
+ return false;
+
+ /* Check whether this is feasible. */
+ if (!vma_expandable(vrm->vma, vrm->delta))
+ return false;
+
+ return true;
+}
+
/*
- * Expand (or shrink) an existing mapping, potentially moving it at the
- * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
- *
- * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
- * This option implies MREMAP_MAYMOVE.
+ * Are the parameters passed to mremap() valid? If so return 0, otherwise return
+ * error.
*/
-SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
- unsigned long, new_len, unsigned long, flags,
- unsigned long, new_addr)
+static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
+
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long ret = -EINVAL;
- bool locked = false;
- struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
- LIST_HEAD(uf_unmap_early);
- LIST_HEAD(uf_unmap);
+ unsigned long addr = vrm->addr;
+ unsigned long flags = vrm->flags;
+
+ /* Ensure no unexpected flag values. */
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
+ return -EINVAL;
+
+ /* Start address must be page-aligned. */
+ if (offset_in_page(addr))
+ return -EINVAL;
/*
- * There is a deliberate asymmetry here: we strip the pointer tag
- * from the old address but leave the new address alone. This is
- * for consistency with mmap(), where we prevent the creation of
- * aliasing mappings in userspace by leaving the tag bits of the
- * mapping address intact. A non-zero tag will cause the subsequent
- * range checks to reject the address as invalid.
- *
- * See Documentation/arch/arm64/tagged-address-abi.rst for more
- * information.
+ * We allow a zero old-len as a special case
+ * for DOS-emu "duplicate shm area" thing. But
+ * a zero new-len is nonsensical.
*/
- addr = untagged_addr(addr);
+ if (!PAGE_ALIGN(vrm->new_len))
+ return -EINVAL;
- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
- return ret;
+ /* Remainder of checks are for cases with specific new_addr. */
+ if (!vrm_implies_new_addr(vrm))
+ return 0;
- if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
- return ret;
+ /* The new address must be page-aligned. */
+ if (offset_in_page(vrm->new_addr))
+ return -EINVAL;
+
+ /* A fixed address implies a move. */
+ if (!(flags & MREMAP_MAYMOVE))
+ return -EINVAL;
+
+ /* MREMAP_DONTUNMAP does not allow resizing in the process. */
+ if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len)
+ return -EINVAL;
/*
- * MREMAP_DONTUNMAP is always a move and it does not allow resizing
- * in the process.
+ * move_vma() need us to stay 4 maps below the threshold, otherwise
+ * it will bail out at the very beginning.
+ * That is a problem if we have already unmaped the regions here
+ * (new_addr, and old_addr), because userspace will not know the
+ * state of the vma's after it gets -ENOMEM.
+ * So, to avoid such scenario we can pre-compute if the whole
+ * operation has high chances to success map-wise.
+ * Worst-scenario case is when both vma's (new_addr and old_addr) get
+ * split in 3 before unmapping it.
+ * That means 2 more maps (1 for each) to the ones we already hold.
+ * Check whether current map count plus 2 still leads us to 4 maps below
+ * the threshold, otherwise return -ENOMEM here to be more safe.
*/
- if (flags & MREMAP_DONTUNMAP &&
- (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
- return ret;
+ if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3)
+ return -ENOMEM;
+ return 0;
+}
- if (offset_in_page(addr))
- return ret;
+/*
+ * We know we can expand the VMA in-place by delta pages, so do so.
+ *
+ * If we discover the VMA is locked, update mm_struct statistics accordingly and
+ * indicate so to the caller.
+ */
+static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = vrm->vma;
+ VMA_ITERATOR(vmi, mm, vma->vm_end);
- old_len = PAGE_ALIGN(old_len);
- new_len = PAGE_ALIGN(new_len);
+ if (!vrm_charge(vrm))
+ return -ENOMEM;
/*
- * We allow a zero old-len as a special case
- * for DOS-emu "duplicate shm area" thing. But
- * a zero new-len is nonsensical.
+ * Function vma_merge_extend() is called on the
+ * extension we are adding to the already existing vma,
+ * vma_merge_extend() will merge this extension with the
+ * already existing vma (expand operation itself) and
+ * possibly also with the next vma if it becomes
+ * adjacent to the expanded vma and otherwise
+ * compatible.
*/
- if (!new_len)
- return ret;
-
- if (mmap_write_lock_killable(current->mm))
- return -EINTR;
- vma = vma_lookup(mm, addr);
+ vma = vma_merge_extend(&vmi, vma, vrm->delta);
if (!vma) {
- ret = -EFAULT;
- goto out;
+ vrm_uncharge(vrm);
+ return -ENOMEM;
}
+ vrm->vma = vma;
- /* Don't allow remapping vmas when they have already been sealed */
- if (!can_modify_vma(vma)) {
- ret = -EPERM;
- goto out;
- }
+ vrm_stat_account(vrm, vrm->delta);
+
+ return 0;
+}
+
+static bool align_hugetlb(struct vma_remap_struct *vrm)
+{
+ struct hstate *h __maybe_unused = hstate_vma(vrm->vma);
+
+ vrm->old_len = ALIGN(vrm->old_len, huge_page_size(h));
+ vrm->new_len = ALIGN(vrm->new_len, huge_page_size(h));
+
+ /* addrs must be huge page aligned */
+ if (vrm->addr & ~huge_page_mask(h))
+ return false;
+ if (vrm->new_addr & ~huge_page_mask(h))
+ return false;
+
+ /*
+ * Don't allow remap expansion, because the underlying hugetlb
+ * reservation is not yet capable to handle split reservation.
+ */
+ if (vrm->new_len > vrm->old_len)
+ return false;
+
+ vrm_set_delta(vrm);
+
+ return true;
+}
- if (is_vm_hugetlb_page(vma)) {
- struct hstate *h __maybe_unused = hstate_vma(vma);
+/*
+ * We are mremap()'ing without specifying a fixed address to move to, but are
+ * requesting that the VMA's size be increased.
+ *
+ * Try to do so in-place, if this fails, then move the VMA to a new location to
+ * action the change.
+ */
+static unsigned long expand_vma(struct vma_remap_struct *vrm)
+{
+ unsigned long err;
+ unsigned long addr = vrm->addr;
- old_len = ALIGN(old_len, huge_page_size(h));
- new_len = ALIGN(new_len, huge_page_size(h));
+ err = resize_is_valid(vrm);
+ if (err)
+ return err;
- /* addrs must be huge page aligned */
- if (addr & ~huge_page_mask(h))
- goto out;
- if (new_addr & ~huge_page_mask(h))
- goto out;
+ /*
+ * [addr, old_len) spans precisely to the end of the VMA, so try to
+ * expand it in-place.
+ */
+ if (vrm_can_expand_in_place(vrm)) {
+ err = expand_vma_in_place(vrm);
+ if (err)
+ return err;
/*
- * Don't allow remap expansion, because the underlying hugetlb
- * reservation is not yet capable to handle split reservation.
+ * We want to populate the newly expanded portion of the VMA to
+ * satisfy the expectation that mlock()'ing a VMA maintains all
+ * of its pages in memory.
*/
- if (new_len > old_len)
- goto out;
- }
+ if (vrm->mlocked)
+ vrm->new_addr = addr;
- if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
- ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked, flags, &uf, &uf_unmap_early,
- &uf_unmap);
- goto out;
+ /* OK we're done! */
+ return addr;
}
/*
- * Always allow a shrinking remap: that just unmaps
- * the unnecessary pages..
- * do_vmi_munmap does all the needed commit accounting, and
- * unlocks the mmap_lock if so directed.
+ * We weren't able to just expand or shrink the area,
+ * we need to create a new one and move it.
*/
- if (old_len >= new_len) {
- VMA_ITERATOR(vmi, mm, addr + new_len);
- if (old_len == new_len) {
- ret = addr;
- goto out;
- }
+ /* We're not allowed to move the VMA, so error out. */
+ if (!(vrm->flags & MREMAP_MAYMOVE))
+ return -ENOMEM;
+
+ /* Find a new location to move the VMA to. */
+ err = vrm_set_new_addr(vrm);
+ if (err)
+ return err;
+
+ return move_vma(vrm);
+}
+
+/*
+ * Attempt to resize the VMA in-place, if we cannot, then move the VMA to the
+ * first available address to perform the operation.
+ */
+static unsigned long mremap_at(struct vma_remap_struct *vrm)
+{
+ unsigned long res;
- ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
- &uf_unmap, true);
- if (ret)
- goto out;
+ switch (vrm->remap_type) {
+ case MREMAP_INVALID:
+ break;
+ case MREMAP_NO_RESIZE:
+ /* NO-OP CASE - resizing to the same size. */
+ return vrm->addr;
+ case MREMAP_SHRINK:
+ /*
+ * SHRINK CASE. Can always be done in-place.
+ *
+ * Simply unmap the shrunken portion of the VMA. This does all
+ * the needed commit accounting, and we indicate that the mmap
+ * lock should be dropped.
+ */
+ res = shrink_vma(vrm, /* drop_lock= */true);
+ if (res)
+ return res;
- ret = addr;
- goto out_unlocked;
+ return vrm->addr;
+ case MREMAP_EXPAND:
+ return expand_vma(vrm);
}
- /*
- * Ok, we need to grow..
- */
- ret = resize_is_valid(vma, addr, old_len, new_len, flags);
+ BUG();
+}
+
+static unsigned long do_mremap(struct vma_remap_struct *vrm)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long ret;
+
+ ret = check_mremap_params(vrm);
if (ret)
- goto out;
+ return ret;
- /* old_len exactly to the end of the area..
- */
- if (old_len == vma->vm_end - addr) {
- unsigned long delta = new_len - old_len;
-
- /* can we just expand the current mapping? */
- if (vma_expandable(vma, delta)) {
- long pages = delta >> PAGE_SHIFT;
- VMA_ITERATOR(vmi, mm, vma->vm_end);
- long charged = 0;
-
- if (vma->vm_flags & VM_ACCOUNT) {
- if (security_vm_enough_memory_mm(mm, pages)) {
- ret = -ENOMEM;
- goto out;
- }
- charged = pages;
- }
+ vrm->old_len = PAGE_ALIGN(vrm->old_len);
+ vrm->new_len = PAGE_ALIGN(vrm->new_len);
+ vrm_set_delta(vrm);
- /*
- * Function vma_merge_extend() is called on the
- * extension we are adding to the already existing vma,
- * vma_merge_extend() will merge this extension with the
- * already existing vma (expand operation itself) and
- * possibly also with the next vma if it becomes
- * adjacent to the expanded vma and otherwise
- * compatible.
- */
- vma = vma_merge_extend(&vmi, vma, delta);
- if (!vma) {
- vm_unacct_memory(charged);
- ret = -ENOMEM;
- goto out;
- }
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ vrm->mmap_locked = true;
- vm_stat_account(mm, vma->vm_flags, pages);
- if (vma->vm_flags & VM_LOCKED) {
- mm->locked_vm += pages;
- locked = true;
- new_addr = addr;
- }
- ret = addr;
- goto out;
- }
+ vma = vrm->vma = vma_lookup(mm, vrm->addr);
+ if (!vma) {
+ ret = -EFAULT;
+ goto out;
}
- /*
- * We weren't able to just expand or shrink the area,
- * we need to create a new one and move it..
- */
- ret = -ENOMEM;
- if (flags & MREMAP_MAYMOVE) {
- unsigned long map_flags = 0;
- if (vma->vm_flags & VM_MAYSHARE)
- map_flags |= MAP_SHARED;
-
- new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
- vma->vm_pgoff +
- ((addr - vma->vm_start) >> PAGE_SHIFT),
- map_flags);
- if (IS_ERR_VALUE(new_addr)) {
- ret = new_addr;
- goto out;
- }
+ /* If mseal()'d, mremap() is prohibited. */
+ if (!can_modify_vma(vma)) {
+ ret = -EPERM;
+ goto out;
+ }
- ret = move_vma(vma, addr, old_len, new_len, new_addr,
- &locked, flags, &uf, &uf_unmap);
+ /* Align to hugetlb page size, if required. */
+ if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) {
+ ret = -EINVAL;
+ goto out;
}
+
+ vrm->remap_type = vrm_remap_type(vrm);
+
+ /* Actually execute mremap. */
+ ret = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
+
out:
- if (offset_in_page(ret))
- locked = false;
- mmap_write_unlock(current->mm);
- if (locked && new_len > old_len)
- mm_populate(new_addr + old_len, new_len - old_len);
-out_unlocked:
- userfaultfd_unmap_complete(mm, &uf_unmap_early);
- mremap_userfaultfd_complete(&uf, addr, ret, old_len);
- userfaultfd_unmap_complete(mm, &uf_unmap);
+ if (vrm->mmap_locked) {
+ mmap_write_unlock(mm);
+ vrm->mmap_locked = false;
+
+ if (!offset_in_page(ret) && vrm->mlocked && vrm->new_len > vrm->old_len)
+ mm_populate(vrm->new_addr + vrm->old_len, vrm->delta);
+ }
+
+ userfaultfd_unmap_complete(mm, vrm->uf_unmap_early);
+ mremap_userfaultfd_complete(vrm->uf, vrm->addr, ret, vrm->old_len);
+ userfaultfd_unmap_complete(mm, vrm->uf_unmap);
+
return ret;
}
+
+/*
+ * Expand (or shrink) an existing mapping, potentially moving it at the
+ * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
+ *
+ * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
+ * This option implies MREMAP_MAYMOVE.
+ */
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+ unsigned long, new_len, unsigned long, flags,
+ unsigned long, new_addr)
+{
+ struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+ LIST_HEAD(uf_unmap_early);
+ LIST_HEAD(uf_unmap);
+ /*
+ * There is a deliberate asymmetry here: we strip the pointer tag
+ * from the old address but leave the new address alone. This is
+ * for consistency with mmap(), where we prevent the creation of
+ * aliasing mappings in userspace by leaving the tag bits of the
+ * mapping address intact. A non-zero tag will cause the subsequent
+ * range checks to reject the address as invalid.
+ *
+ * See Documentation/arch/arm64/tagged-address-abi.rst for more
+ * information.
+ */
+ struct vma_remap_struct vrm = {
+ .addr = untagged_addr(addr),
+ .old_len = old_len,
+ .new_len = new_len,
+ .flags = flags,
+ .new_addr = new_addr,
+
+ .uf = &uf,
+ .uf_unmap_early = &uf_unmap_early,
+ .uf_unmap = &uf_unmap,
+
+ .remap_type = MREMAP_INVALID, /* We set later. */
+ };
+
+ return do_mremap(&vrm);
+}
diff --git a/mm/nommu.c b/mm/nommu.c
index baa79abdaf03..617e7ba8022f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -42,18 +42,11 @@
#include <asm/mmu_context.h>
#include "internal.h"
-void *high_memory;
-EXPORT_SYMBOL(high_memory);
-struct page *mem_map;
-unsigned long max_mapnr;
-EXPORT_SYMBOL(max_mapnr);
unsigned long highest_memmap_pfn;
-int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
-EXPORT_SYMBOL(mem_map);
/* list of mapped, potentially shareable regions */
static struct kmem_cache *vm_region_jar;
@@ -392,6 +385,19 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
return mm->brk = brk;
}
+static int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
+
+static const struct ctl_table nommu_table[] = {
+ {
+ .procname = "nr_trim_pages",
+ .data = &sysctl_nr_trim_pages,
+ .maxlen = sizeof(sysctl_nr_trim_pages),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+};
+
/*
* initialise the percpu counter for VM and region record slabs
*/
@@ -402,6 +408,7 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
+ register_sysctl_init("vm", nommu_table);
}
/*
@@ -1191,7 +1198,7 @@ share:
setup_vma_to_mm(vma, current->mm);
current->mm->map_count++;
/* add the VMA to the tree */
- vma_iter_store(&vmi, vma);
+ vma_iter_store_new(&vmi, vma);
/* we flush the region from the icache only when the first executable
* mapping of it is made */
@@ -1356,7 +1363,7 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
setup_vma_to_mm(vma, mm);
setup_vma_to_mm(new, mm);
- vma_iter_store(vmi, new);
+ vma_iter_store_new(vmi, new);
mm->map_count++;
return 0;
@@ -1613,13 +1620,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
- BUG();
- return 0;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();
@@ -1708,6 +1708,85 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
}
EXPORT_SYMBOL_GPL(access_process_vm);
+#ifdef CONFIG_BPF_SYSCALL
+/*
+ * Copy a string from another process's address space as given in mm.
+ * If there is any error return -EFAULT.
+ */
+static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len)
+{
+ unsigned long addr_end;
+ struct vm_area_struct *vma;
+ int ret = -EFAULT;
+
+ *(char *)buf = '\0';
+
+ if (mmap_read_lock_killable(mm))
+ return ret;
+
+ /* the access must start within one of the target process's mappings */
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto out;
+
+ if (check_add_overflow(addr, len, &addr_end))
+ goto out;
+
+ /* don't overrun this mapping */
+ if (addr_end > vma->vm_end)
+ len = vma->vm_end - addr;
+
+ /* only read mappings where it is permitted */
+ if (vma->vm_flags & VM_MAYREAD) {
+ ret = strscpy(buf, (char *)addr, len);
+ if (ret < 0)
+ ret = len - 1;
+ }
+
+out:
+ mmap_read_unlock(mm);
+ return ret;
+}
+
+/**
+ * copy_remote_vm_str - copy a string from another process's address space.
+ * @tsk: the task of the target address space
+ * @addr: start address to read from
+ * @buf: destination buffer
+ * @len: number of bytes to copy
+ * @gup_flags: flags modifying lookup behaviour (unused)
+ *
+ * The caller must hold a reference on @mm.
+ *
+ * Return: number of bytes copied from @addr (source) to @buf (destination);
+ * not including the trailing NUL. Always guaranteed to leave NUL-terminated
+ * buffer. On any error, return -EFAULT.
+ */
+int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm) {
+ *(char *)buf = '\0';
+ return -EFAULT;
+ }
+
+ ret = __copy_remote_vm_str(mm, addr, buf, len);
+
+ mmput(mm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(copy_remote_vm_str);
+#endif /* CONFIG_BPF_SYSCALL */
+
/**
* nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
* @inode: The inode to check
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1cf121ad7085..25923cfec9c6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -563,7 +563,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm)
}
/*
- * Reaps the address space of the give task.
+ * Reaps the address space of the given task.
*
* Returns true on success and false if none or part of the address space
* has been reclaimed and the caller should retry later.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index eb55ece39c56..c81624bc3969 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -120,29 +120,6 @@ EXPORT_SYMBOL(laptop_mode);
struct wb_domain global_wb_domain;
-/* consolidated parameters for balance_dirty_pages() and its subroutines */
-struct dirty_throttle_control {
-#ifdef CONFIG_CGROUP_WRITEBACK
- struct wb_domain *dom;
- struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */
-#endif
- struct bdi_writeback *wb;
- struct fprop_local_percpu *wb_completions;
-
- unsigned long avail; /* dirtyable */
- unsigned long dirty; /* file_dirty + write + nfs */
- unsigned long thresh; /* dirty threshold */
- unsigned long bg_thresh; /* dirty background threshold */
-
- unsigned long wb_dirty; /* per-wb counterparts */
- unsigned long wb_thresh;
- unsigned long wb_bg_thresh;
-
- unsigned long pos_ratio;
- bool freerun;
- bool dirty_exceeded;
-};
-
/*
* Length of period for aging writeout fractions of bdis. This is an
* arbitrarily chosen number. The longer the period, the slower fractions will
@@ -663,7 +640,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom)
{
- del_timer_sync(&dom->period_timer);
+ timer_delete_sync(&dom->period_timer);
fprop_global_destroy(&dom->completions);
}
#endif
@@ -1095,7 +1072,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
struct bdi_writeback *wb = dtc->wb;
unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
- unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+ unsigned long limit = dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long wb_thresh = dtc->wb_thresh;
unsigned long x_intercept;
unsigned long setpoint; /* dirty pages' target balance point */
@@ -1962,11 +1939,7 @@ free_running:
*/
if (pause < min_pause) {
trace_balance_dirty_pages(wb,
- sdtc->thresh,
- sdtc->bg_thresh,
- sdtc->dirty,
- sdtc->wb_thresh,
- sdtc->wb_dirty,
+ sdtc,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
@@ -1991,11 +1964,7 @@ free_running:
pause:
trace_balance_dirty_pages(wb,
- sdtc->thresh,
- sdtc->bg_thresh,
- sdtc->dirty,
- sdtc->wb_thresh,
- sdtc->wb_dirty,
+ sdtc,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
@@ -2260,7 +2229,7 @@ void laptop_sync_completion(void)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
- del_timer(&bdi->laptop_mode_wb_timer);
+ timer_delete(&bdi->laptop_mode_wb_timer);
rcu_read_unlock();
}
@@ -3109,6 +3078,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write)
int access_ret;
VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
XA_STATE(xas, &mapping->i_pages, folio_index(folio));
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 579789600a3c..fd6b865cb1ab 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+/* Free the page without taking locks. Rely on trylock only. */
+#define FPI_TRYLOCK ((__force fpi_t)BIT(2))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -273,6 +276,7 @@ int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
static int watermark_boost_factor __read_mostly = 15000;
static int watermark_scale_factor = 10;
+int defrag_mode;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -508,9 +512,9 @@ out:
static inline unsigned int order_to_pindex(int migratetype, int order)
{
- bool __maybe_unused movable;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool movable;
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != HPAGE_PMD_ORDER);
@@ -614,6 +618,10 @@ compaction_capture(struct capture_control *capc, struct page *page,
capc->cc->migratetype != MIGRATE_MOVABLE)
return false;
+ if (migratetype != capc->cc->migratetype)
+ trace_mm_page_alloc_extfrag(page, capc->cc->order, order,
+ capc->cc->migratetype, migratetype);
+
capc->page = page;
return true;
}
@@ -655,16 +663,20 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone,
bool tail)
{
struct free_area *area = &zone->free_area[order];
+ int nr_pages = 1 << order;
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
"page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), migratetype, 1 << order);
+ get_pageblock_migratetype(page), migratetype, nr_pages);
if (tail)
list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
else
list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
+
+ if (order >= pageblock_order && !is_migrate_isolate(migratetype))
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
}
/*
@@ -676,24 +688,34 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int old_mt, int new_mt)
{
struct free_area *area = &zone->free_area[order];
+ int nr_pages = 1 << order;
/* Free page moving can fail, so it happens before the type update */
VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
"page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), old_mt, 1 << order);
+ get_pageblock_migratetype(page), old_mt, nr_pages);
list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
- account_freepages(zone, -(1 << order), old_mt);
- account_freepages(zone, 1 << order, new_mt);
+ account_freepages(zone, -nr_pages, old_mt);
+ account_freepages(zone, nr_pages, new_mt);
+
+ if (order >= pageblock_order &&
+ is_migrate_isolate(old_mt) != is_migrate_isolate(new_mt)) {
+ if (!is_migrate_isolate(old_mt))
+ nr_pages = -nr_pages;
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
+ }
}
static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
+ int nr_pages = 1 << order;
+
VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
"page type is %lu, passed migratetype is %d (nr=%d)\n",
- get_pageblock_migratetype(page), migratetype, 1 << order);
+ get_pageblock_migratetype(page), migratetype, nr_pages);
/* clear reported state and update reported page count */
if (page_reported(page))
@@ -703,6 +725,9 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zon
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
+
+ if (order >= pageblock_order && !is_migrate_isolate(migratetype))
+ __mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -947,21 +972,34 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
switch (page - head_page) {
case 1:
/* the first tail page: these may be in place of ->mapping */
- if (unlikely(folio_entire_mapcount(folio))) {
- bad_page(page, "nonzero entire_mapcount");
- goto out;
- }
if (unlikely(folio_large_mapcount(folio))) {
bad_page(page, "nonzero large_mapcount");
goto out;
}
- if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT) &&
+ unlikely(atomic_read(&folio->_nr_pages_mapped))) {
bad_page(page, "nonzero nr_pages_mapped");
goto out;
}
- if (unlikely(atomic_read(&folio->_pincount))) {
- bad_page(page, "nonzero pincount");
- goto out;
+ if (IS_ENABLED(CONFIG_MM_ID)) {
+ if (unlikely(folio->_mm_id_mapcount[0] != -1)) {
+ bad_page(page, "nonzero mm mapcount 0");
+ goto out;
+ }
+ if (unlikely(folio->_mm_id_mapcount[1] != -1)) {
+ bad_page(page, "nonzero mm mapcount 1");
+ goto out;
+ }
+ }
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
+ bad_page(page, "nonzero entire_mapcount");
+ goto out;
+ }
+ if (unlikely(atomic_read(&folio->_pincount))) {
+ bad_page(page, "nonzero pincount");
+ goto out;
+ }
}
break;
case 2:
@@ -970,7 +1008,22 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
bad_page(page, "on deferred list");
goto out;
}
+ if (!IS_ENABLED(CONFIG_64BIT)) {
+ if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
+ bad_page(page, "nonzero entire_mapcount");
+ goto out;
+ }
+ if (unlikely(atomic_read(&folio->_pincount))) {
+ bad_page(page, "nonzero pincount");
+ goto out;
+ }
+ }
break;
+ case 3:
+ /* the third tail page: hugetlb specifics overlap ->mappings */
+ if (IS_ENABLED(CONFIG_HUGETLB_PAGE))
+ break;
+ fallthrough;
default:
if (page->mapping != TAIL_MAPPING) {
bad_page(page, "corrupted mapping in tail page");
@@ -1041,6 +1094,84 @@ static void kernel_init_pages(struct page *page, int numpages)
kasan_enable_current();
}
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+void __clear_page_tag_ref(struct page *page)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ set_codetag_empty(&ref);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline
+void __pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled())
+ __pgalloc_tag_add(page, task, nr);
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline
+void __pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_sub(&ref, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled())
+ __pgalloc_tag_sub(page, nr);
+}
+
+static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
+{
+ struct alloc_tag *tag;
+
+ if (!mem_alloc_profiling_enabled())
+ return;
+
+ tag = __pgalloc_tag_get(page);
+ if (tag)
+ this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING */
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr) {}
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
__always_inline bool free_pages_prepare(struct page *page,
unsigned int order)
{
@@ -1096,8 +1227,12 @@ __always_inline bool free_pages_prepare(struct page *page,
if (unlikely(order)) {
int i;
- if (compound)
+ if (compound) {
page[1].flags &= ~PAGE_FLAGS_SECOND;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ folio->_nr_pages = 0;
+#endif
+ }
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_page_prepare(page, page + i);
@@ -1249,13 +1384,44 @@ static void split_large_buddy(struct zone *zone, struct page *page,
} while (1);
}
+static void add_page_to_zone_llist(struct zone *zone, struct page *page,
+ unsigned int order)
+{
+ /* Remember the order */
+ page->order = order;
+ /* Add the page to the free list */
+ llist_add(&page->pcp_llist, &zone->trylock_free_pages);
+}
+
static void free_one_page(struct zone *zone, struct page *page,
unsigned long pfn, unsigned int order,
fpi_t fpi_flags)
{
+ struct llist_head *llhead;
unsigned long flags;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
+ spin_lock_irqsave(&zone->lock, flags);
+ }
+
+ /* The lock succeeded. Process deferred pages. */
+ llhead = &zone->trylock_free_pages;
+ if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
+ struct llist_node *llnode;
+ struct page *p, *tmp;
+
+ llnode = llist_del_all(llhead);
+ llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
+ unsigned int p_order = p->order;
+
+ split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
+ __count_vm_events(PGFREE, 1 << p_order);
+ }
+ }
split_large_buddy(zone, page, pfn, order, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -1427,7 +1593,7 @@ static __always_inline void page_del_and_expand(struct zone *zone,
static void check_new_page_bad(struct page *page)
{
- if (unlikely(page->flags & __PG_HWPOISON)) {
+ if (unlikely(PageHWPoison(page))) {
/* Don't complain about hwpoisoned pages */
if (PageBuddy(page))
__ClearPageBuddy(page);
@@ -1825,47 +1991,6 @@ static void change_pageblock_range(struct page *pageblock_page,
}
}
-/*
- * When we are falling back to another migratetype during allocation, try to
- * steal extra free pages from the same pageblocks to satisfy further
- * allocations, instead of polluting multiple pageblocks.
- *
- * If we are stealing a relatively large buddy page, it is likely there will
- * be more free pages in the pageblock, so try to steal them all. For
- * reclaimable and unmovable allocations, we steal regardless of page size,
- * as fragmentation caused by those allocations polluting movable pageblocks
- * is worse than movable allocations stealing from unmovable and reclaimable
- * pageblocks.
- */
-static bool can_steal_fallback(unsigned int order, int start_mt)
-{
- /*
- * Leaving this order check is intended, although there is
- * relaxed order check in next check. The reason is that
- * we can actually steal whole pageblock if this condition met,
- * but, below check doesn't guarantee it and that is just heuristic
- * so could be changed anytime.
- */
- if (order >= pageblock_order)
- return true;
-
- /*
- * Movable pages won't cause permanent fragmentation, so when you alloc
- * small pages, you just need to temporarily steal unmovable or
- * reclaimable pages that are closest to the request size. After a
- * while, memory compaction may occur to form large contiguous pages,
- * and the next movable allocation may not need to steal. Unmovable and
- * reclaimable allocations need to actually steal pages.
- */
- if (order >= pageblock_order / 2 ||
- start_mt == MIGRATE_RECLAIMABLE ||
- start_mt == MIGRATE_UNMOVABLE ||
- page_group_by_mobility_disabled)
- return true;
-
- return false;
-}
-
static inline bool boost_watermark(struct zone *zone)
{
unsigned long max_boost;
@@ -1904,30 +2029,99 @@ static inline bool boost_watermark(struct zone *zone)
}
/*
- * This function implements actual steal behaviour. If order is large enough, we
- * can claim the whole pageblock for the requested migratetype. If not, we check
- * the pageblock for constituent pages; if at least half of the pages are free
- * or compatible, we can still claim the whole block, so pages freed in the
- * future will be put on the correct free list. Otherwise, we isolate exactly
- * the order we need from the fallback block and leave its migratetype alone.
+ * When we are falling back to another migratetype during allocation, should we
+ * try to claim an entire block to satisfy further allocations, instead of
+ * polluting multiple pageblocks?
*/
-static struct page *
-steal_suitable_fallback(struct zone *zone, struct page *page,
- int current_order, int order, int start_type,
- unsigned int alloc_flags, bool whole_block)
+static bool should_try_claim_block(unsigned int order, int start_mt)
{
- int free_pages, movable_pages, alike_pages;
- unsigned long start_pfn;
- int block_type;
+ /*
+ * Leaving this order check is intended, although there is
+ * relaxed order check in next check. The reason is that
+ * we can actually claim the whole pageblock if this condition met,
+ * but, below check doesn't guarantee it and that is just heuristic
+ * so could be changed anytime.
+ */
+ if (order >= pageblock_order)
+ return true;
- block_type = get_pageblock_migratetype(page);
+ /*
+ * Above a certain threshold, always try to claim, as it's likely there
+ * will be more free pages in the pageblock.
+ */
+ if (order >= pageblock_order / 2)
+ return true;
/*
- * This can happen due to races and we want to prevent broken
- * highatomic accounting.
+ * Unmovable/reclaimable allocations would cause permanent
+ * fragmentations if they fell back to allocating from a movable block
+ * (polluting it), so we try to claim the whole block regardless of the
+ * allocation size. Later movable allocations can always steal from this
+ * block, which is less problematic.
*/
- if (is_migrate_highatomic(block_type))
- goto single_page;
+ if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE)
+ return true;
+
+ if (page_group_by_mobility_disabled)
+ return true;
+
+ /*
+ * Movable pages won't cause permanent fragmentation, so when you alloc
+ * small pages, we just need to temporarily steal unmovable or
+ * reclaimable pages that are closest to the request size. After a
+ * while, memory compaction may occur to form large contiguous pages,
+ * and the next movable allocation may not need to steal.
+ */
+ return false;
+}
+
+/*
+ * Check whether there is a suitable fallback freepage with requested order.
+ * Sets *claim_block to instruct the caller whether it should convert a whole
+ * pageblock to the returned migratetype.
+ * If only_claim is true, this function returns fallback_mt only if
+ * we would do this whole-block claiming. This would help to reduce
+ * fragmentation due to mixed migratetype pages in one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_claim, bool *claim_block)
+{
+ int i;
+ int fallback_mt;
+
+ if (area->nr_free == 0)
+ return -1;
+
+ *claim_block = false;
+ for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
+ fallback_mt = fallbacks[migratetype][i];
+ if (free_area_empty(area, fallback_mt))
+ continue;
+
+ if (should_try_claim_block(order, migratetype))
+ *claim_block = true;
+
+ if (*claim_block || !only_claim)
+ return fallback_mt;
+ }
+
+ return -1;
+}
+
+/*
+ * This function implements actual block claiming behaviour. If order is large
+ * enough, we can claim the whole pageblock for the requested migratetype. If
+ * not, we check the pageblock for constituent pages; if at least half of the
+ * pages are free or compatible, we can still claim the whole block, so pages
+ * freed in the future will be put on the correct free list.
+ */
+static struct page *
+try_to_claim_block(struct zone *zone, struct page *page,
+ int current_order, int order, int start_type,
+ int block_type, unsigned int alloc_flags)
+{
+ int free_pages, movable_pages, alike_pages;
+ unsigned long start_pfn;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
@@ -1948,14 +2142,10 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
- /* We are not allowed to try stealing from the whole block */
- if (!whole_block)
- goto single_page;
-
/* moving whole block can fail due to zone boundary conditions */
if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
&movable_pages))
- goto single_page;
+ return NULL;
/*
* Determine how many pages are compatible with our allocation.
@@ -1988,198 +2178,24 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
return __rmqueue_smallest(zone, order, start_type);
}
-single_page:
- page_del_and_expand(zone, page, order, current_order, block_type);
- return page;
-}
-
-/*
- * Check whether there is a suitable fallback freepage with requested order.
- * If only_stealable is true, this function returns fallback_mt only if
- * we can steal other freepages all together. This would help to reduce
- * fragmentation due to mixed migratetype pages in one pageblock.
- */
-int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_stealable, bool *can_steal)
-{
- int i;
- int fallback_mt;
-
- if (area->nr_free == 0)
- return -1;
-
- *can_steal = false;
- for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) {
- fallback_mt = fallbacks[migratetype][i];
- if (free_area_empty(area, fallback_mt))
- continue;
-
- if (can_steal_fallback(order, migratetype))
- *can_steal = true;
-
- if (!only_stealable)
- return fallback_mt;
-
- if (*can_steal)
- return fallback_mt;
- }
-
- return -1;
-}
-
-/*
- * Reserve the pageblock(s) surrounding an allocation request for
- * exclusive use of high-order atomic allocations if there are no
- * empty page blocks that contain a page with a suitable order
- */
-static void reserve_highatomic_pageblock(struct page *page, int order,
- struct zone *zone)
-{
- int mt;
- unsigned long max_managed, flags;
-
- /*
- * The number reserved as: minimum is 1 pageblock, maximum is
- * roughly 1% of a zone. But if 1% of a zone falls below a
- * pageblock size, then don't reserve any pageblocks.
- * Check is race-prone but harmless.
- */
- if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
- return;
- max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
- if (zone->nr_reserved_highatomic >= max_managed)
- return;
-
- spin_lock_irqsave(&zone->lock, flags);
-
- /* Recheck the nr_reserved_highatomic limit under the lock */
- if (zone->nr_reserved_highatomic >= max_managed)
- goto out_unlock;
-
- /* Yoink! */
- mt = get_pageblock_migratetype(page);
- /* Only reserve normal pageblocks (i.e., they can merge with others) */
- if (!migratetype_is_mergeable(mt))
- goto out_unlock;
-
- if (order < pageblock_order) {
- if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
- goto out_unlock;
- zone->nr_reserved_highatomic += pageblock_nr_pages;
- } else {
- change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
- zone->nr_reserved_highatomic += 1 << order;
- }
-
-out_unlock:
- spin_unlock_irqrestore(&zone->lock, flags);
+ return NULL;
}
/*
- * Used when an allocation is about to fail under memory pressure. This
- * potentially hurts the reliability of high-order allocations when under
- * intense memory pressure but failed atomic allocations should be easier
- * to recover from than an OOM.
+ * Try finding a free buddy page on the fallback list.
*
- * If @force is true, try to unreserve pageblocks even though highatomic
- * pageblock is exhausted.
- */
-static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
- bool force)
-{
- struct zonelist *zonelist = ac->zonelist;
- unsigned long flags;
- struct zoneref *z;
- struct zone *zone;
- struct page *page;
- int order;
- int ret;
-
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
- ac->nodemask) {
- /*
- * Preserve at least one pageblock unless memory pressure
- * is really high.
- */
- if (!force && zone->nr_reserved_highatomic <=
- pageblock_nr_pages)
- continue;
-
- spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < NR_PAGE_ORDERS; order++) {
- struct free_area *area = &(zone->free_area[order]);
- int mt;
-
- page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
- if (!page)
- continue;
-
- mt = get_pageblock_migratetype(page);
- /*
- * In page freeing path, migratetype change is racy so
- * we can counter several free pages in a pageblock
- * in this loop although we changed the pageblock type
- * from highatomic to ac->migratetype. So we should
- * adjust the count once.
- */
- if (is_migrate_highatomic(mt)) {
- unsigned long size;
- /*
- * It should never happen but changes to
- * locking could inadvertently allow a per-cpu
- * drain to add pages to MIGRATE_HIGHATOMIC
- * while unreserving so be safe and watch for
- * underflows.
- */
- size = max(pageblock_nr_pages, 1UL << order);
- size = min(size, zone->nr_reserved_highatomic);
- zone->nr_reserved_highatomic -= size;
- }
-
- /*
- * Convert to ac->migratetype and avoid the normal
- * pageblock stealing heuristics. Minimally, the caller
- * is doing the work and needs the pages. More
- * importantly, if the block was always converted to
- * MIGRATE_UNMOVABLE or another type then the number
- * of pageblocks that cannot be completely freed
- * may increase.
- */
- if (order < pageblock_order)
- ret = move_freepages_block(zone, page, mt,
- ac->migratetype);
- else {
- move_to_free_list(page, zone, order, mt,
- ac->migratetype);
- change_pageblock_range(page, order,
- ac->migratetype);
- ret = 1;
- }
- /*
- * Reserving the block(s) already succeeded,
- * so this should not fail on zone boundaries.
- */
- WARN_ON_ONCE(ret == -1);
- if (ret > 0) {
- spin_unlock_irqrestore(&zone->lock, flags);
- return ret;
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
- }
-
- return false;
-}
-
-/*
- * Try finding a free buddy page on the fallback list and put it on the free
- * list of requested migratetype, possibly along with other pages from the same
- * block, depending on fragmentation avoidance heuristics. Returns true if
- * fallback was found so that __rmqueue_smallest() can grab it.
+ * This will attempt to claim a whole pageblock for the requested type
+ * to ensure grouping of such requests in the future.
+ *
+ * If a whole block cannot be claimed, steal an individual page, regressing to
+ * __rmqueue_smallest() logic to at least break up as little contiguity as
+ * possible.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
+ *
+ * Return the stolen page, or NULL if none can be found.
*/
static __always_inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
@@ -2190,7 +2206,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
int min_order = order;
struct page *page;
int fallback_mt;
- bool can_steal;
+ bool claim_block;
/*
* Do not steal pages from freelists belonging to other pageblocks
@@ -2209,49 +2225,40 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &can_steal);
+ start_migratetype, false, &claim_block);
if (fallback_mt == -1)
continue;
- /*
- * We cannot steal all free pages from the pageblock and the
- * requested migratetype is movable. In that case it's better to
- * steal and split the smallest available page instead of the
- * largest available page, because even if the next movable
- * allocation falls back into a different pageblock than this
- * one, it won't cause permanent fragmentation.
- */
- if (!can_steal && start_migratetype == MIGRATE_MOVABLE
- && current_order > order)
- goto find_smallest;
+ if (!claim_block)
+ break;
- goto do_steal;
+ page = get_page_from_free_area(area, fallback_mt);
+ page = try_to_claim_block(zone, page, current_order, order,
+ start_migratetype, fallback_mt,
+ alloc_flags);
+ if (page)
+ goto got_one;
}
- return NULL;
+ if (alloc_flags & ALLOC_NOFRAGMENT)
+ return NULL;
-find_smallest:
+ /* No luck claiming pageblock. Find the smallest fallback page */
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &can_steal);
- if (fallback_mt != -1)
- break;
- }
-
- /*
- * This should not happen - we already found a suitable fallback
- * when looking for the largest page.
- */
- VM_BUG_ON(current_order > MAX_PAGE_ORDER);
+ start_migratetype, false, &claim_block);
+ if (fallback_mt == -1)
+ continue;
-do_steal:
- page = get_page_from_free_area(area, fallback_mt);
+ page = get_page_from_free_area(area, fallback_mt);
+ page_del_and_expand(zone, page, order, current_order, fallback_mt);
+ goto got_one;
+ }
- /* take off list, maybe claim block, expand remainder */
- page = steal_suitable_fallback(zone, page, current_order, order,
- start_migratetype, alloc_flags, can_steal);
+ return NULL;
+got_one:
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
@@ -2307,7 +2314,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long flags;
int i;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ return 0;
+ spin_lock_irqsave(&zone->lock, flags);
+ }
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
@@ -2595,7 +2606,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
static void free_frozen_page_commit(struct zone *zone,
struct per_cpu_pages *pcp, struct page *page, int migratetype,
- unsigned int order)
+ unsigned int order, fpi_t fpi_flags)
{
int high, batch;
int pindex;
@@ -2630,6 +2641,14 @@ static void free_frozen_page_commit(struct zone *zone,
}
if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
pcp->free_count += (1 << order);
+
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ /*
+ * Do not attempt to take a zone lock. Let pcp->count get
+ * over high mark temporarily.
+ */
+ return;
+ }
high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count >= high) {
free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
@@ -2644,7 +2663,8 @@ static void free_frozen_page_commit(struct zone *zone,
/*
* Free a pcp page
*/
-void free_frozen_pages(struct page *page, unsigned int order)
+static void __free_frozen_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp;
@@ -2653,7 +2673,7 @@ void free_frozen_pages(struct page *page, unsigned int order)
int migratetype;
if (!pcp_allowed_order(order)) {
- __free_pages_ok(page, order, FPI_NONE);
+ __free_pages_ok(page, order, fpi_flags);
return;
}
@@ -2671,23 +2691,33 @@ void free_frozen_pages(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
return;
}
migratetype = MIGRATE_MOVABLE;
}
+ if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT)
+ && (in_nmi() || in_hardirq()))) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
- free_frozen_page_commit(zone, pcp, page, migratetype, order);
+ free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
pcp_spin_unlock(pcp);
} else {
- free_one_page(zone, page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
}
pcp_trylock_finish(UP_flags);
}
+void free_frozen_pages(struct page *page, unsigned int order)
+{
+ __free_frozen_pages(page, order, FPI_NONE);
+}
+
/*
* Free a batch of folios
*/
@@ -2776,7 +2806,7 @@ void free_unref_folios(struct folio_batch *folios)
trace_mm_page_free_batched(&folio->page);
free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
- order);
+ order, FPI_NONE);
}
if (pcp) {
@@ -2805,7 +2835,7 @@ void split_page(struct page *page, unsigned int order)
set_page_refcounted(page + i);
split_page_owner(page, order, 0);
pgalloc_tag_split(page_folio(page), order, 0);
- split_page_memcg(page, order, 0);
+ split_page_memcg(page, order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -2907,7 +2937,11 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
do {
page = NULL;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ return NULL;
+ spin_lock_irqsave(&zone->lock, flags);
+ }
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
@@ -3095,6 +3129,142 @@ out:
return page;
}
+/*
+ * Reserve the pageblock(s) surrounding an allocation request for
+ * exclusive use of high-order atomic allocations if there are no
+ * empty page blocks that contain a page with a suitable order
+ */
+static void reserve_highatomic_pageblock(struct page *page, int order,
+ struct zone *zone)
+{
+ int mt;
+ unsigned long max_managed, flags;
+
+ /*
+ * The number reserved as: minimum is 1 pageblock, maximum is
+ * roughly 1% of a zone. But if 1% of a zone falls below a
+ * pageblock size, then don't reserve any pageblocks.
+ * Check is race-prone but harmless.
+ */
+ if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages)
+ return;
+ max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages);
+ if (zone->nr_reserved_highatomic >= max_managed)
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ /* Recheck the nr_reserved_highatomic limit under the lock */
+ if (zone->nr_reserved_highatomic >= max_managed)
+ goto out_unlock;
+
+ /* Yoink! */
+ mt = get_pageblock_migratetype(page);
+ /* Only reserve normal pageblocks (i.e., they can merge with others) */
+ if (!migratetype_is_mergeable(mt))
+ goto out_unlock;
+
+ if (order < pageblock_order) {
+ if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
+ goto out_unlock;
+ zone->nr_reserved_highatomic += pageblock_nr_pages;
+ } else {
+ change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
+ zone->nr_reserved_highatomic += 1 << order;
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Used when an allocation is about to fail under memory pressure. This
+ * potentially hurts the reliability of high-order allocations when under
+ * intense memory pressure but failed atomic allocations should be easier
+ * to recover from than an OOM.
+ *
+ * If @force is true, try to unreserve pageblocks even though highatomic
+ * pageblock is exhausted.
+ */
+static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+ bool force)
+{
+ struct zonelist *zonelist = ac->zonelist;
+ unsigned long flags;
+ struct zoneref *z;
+ struct zone *zone;
+ struct page *page;
+ int order;
+ int ret;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
+ ac->nodemask) {
+ /*
+ * Preserve at least one pageblock unless memory pressure
+ * is really high.
+ */
+ if (!force && zone->nr_reserved_highatomic <=
+ pageblock_nr_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
+ struct free_area *area = &(zone->free_area[order]);
+ unsigned long size;
+
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
+ if (!page)
+ continue;
+
+ size = max(pageblock_nr_pages, 1UL << order);
+ /*
+ * It should never happen but changes to
+ * locking could inadvertently allow a per-cpu
+ * drain to add pages to MIGRATE_HIGHATOMIC
+ * while unreserving so be safe and watch for
+ * underflows.
+ */
+ if (WARN_ON_ONCE(size > zone->nr_reserved_highatomic))
+ size = zone->nr_reserved_highatomic;
+ zone->nr_reserved_highatomic -= size;
+
+ /*
+ * Convert to ac->migratetype and avoid the normal
+ * pageblock stealing heuristics. Minimally, the caller
+ * is doing the work and needs the pages. More
+ * importantly, if the block was always converted to
+ * MIGRATE_UNMOVABLE or another type then the number
+ * of pageblocks that cannot be completely freed
+ * may increase.
+ */
+ if (order < pageblock_order)
+ ret = move_freepages_block(zone, page,
+ MIGRATE_HIGHATOMIC,
+ ac->migratetype);
+ else {
+ move_to_free_list(page, zone, order,
+ MIGRATE_HIGHATOMIC,
+ ac->migratetype);
+ change_pageblock_range(page, order,
+ ac->migratetype);
+ ret = 1;
+ }
+ /*
+ * Reserving the block(s) already succeeded,
+ * so this should not fail on zone boundaries.
+ */
+ WARN_ON_ONCE(ret == -1);
+ if (ret > 0) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ return false;
+}
+
static inline long __zone_watermark_unusable_free(struct zone *z,
unsigned int order, unsigned int alloc_flags)
{
@@ -3298,6 +3468,11 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
*/
alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
+ if (defrag_mode) {
+ alloc_flags |= ALLOC_NOFRAGMENT;
+ return alloc_flags;
+ }
+
#ifdef CONFIG_ZONE_DMA32
if (!zone)
return alloc_flags;
@@ -3389,7 +3564,7 @@ retry:
continue;
}
- if (no_fallback && nr_online_nodes > 1 &&
+ if (no_fallback && !defrag_mode && nr_online_nodes > 1 &&
zone != zonelist_zone(ac->preferred_zoneref)) {
int local_nid;
@@ -3500,7 +3675,7 @@ try_this_zone:
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
- if (no_fallback) {
+ if (no_fallback && !defrag_mode) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
@@ -3979,15 +4154,21 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
struct zone *zone;
pg_data_t *last_pgdat = NULL;
enum zone_type highest_zoneidx = ac->highest_zoneidx;
+ unsigned int reclaim_order;
+
+ if (defrag_mode)
+ reclaim_order = max(order, pageblock_order);
+ else
+ reclaim_order = order;
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
if (!managed_zone(zone))
continue;
- if (last_pgdat != zone->zone_pgdat) {
- wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
- last_pgdat = zone->zone_pgdat;
- }
+ if (last_pgdat == zone->zone_pgdat)
+ continue;
+ wakeup_kswapd(zone, gfp_mask, reclaim_order, highest_zoneidx);
+ last_pgdat = zone->zone_pgdat;
}
}
@@ -4037,6 +4218,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
+ if (defrag_mode)
+ alloc_flags |= ALLOC_NOFRAGMENT;
+
return alloc_flags;
}
@@ -4243,6 +4427,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
restart:
compaction_retries = 0;
no_progress_loops = 0;
+ compact_result = COMPACT_SKIPPED;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist_iter_cookie = zonelist_iter_begin();
@@ -4418,6 +4603,11 @@ retry:
&compaction_retries))
goto retry;
+ /* Reclaim/compaction failed to prevent the fallback */
+ if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
/*
* Deal with possible cpuset update races or zonelist updates to avoid
@@ -4511,7 +4701,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
might_alloc(gfp_mask);
- if (should_fail_alloc_page(gfp_mask, order))
+ /*
+ * Don't invoke should_fail logic, since it may call
+ * get_random_u32() and printk() which need to spin_lock.
+ */
+ if (!(*alloc_flags & ALLOC_TRYLOCK) &&
+ should_fail_alloc_page(gfp_mask, order))
return false;
*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
@@ -4809,9 +5004,10 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
EXPORT_SYMBOL(get_zeroed_page_noprof);
/**
- * __free_pages - Free pages allocated with alloc_pages().
+ * ___free_pages - Free pages allocated with alloc_pages().
* @page: The page pointer returned from alloc_pages().
* @order: The order of the allocation.
+ * @fpi_flags: Free Page Internal flags.
*
* This function can free multi-page allocations that are not compound
* pages. It does not check that the @order passed in matches that of
@@ -4828,22 +5024,36 @@ EXPORT_SYMBOL(get_zeroed_page_noprof);
* Context: May be called in interrupt context or while holding a normal
* spinlock, but not in NMI context or while holding a raw spinlock.
*/
-void __free_pages(struct page *page, unsigned int order)
+static void ___free_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
/* get PageHead before we drop reference */
int head = PageHead(page);
- struct alloc_tag *tag = pgalloc_tag_get(page);
if (put_page_testzero(page))
- free_frozen_pages(page, order);
+ __free_frozen_pages(page, order, fpi_flags);
else if (!head) {
- pgalloc_tag_sub_pages(tag, (1 << order) - 1);
+ pgalloc_tag_sub_pages(page, (1 << order) - 1);
while (order-- > 0)
- free_frozen_pages(page + (1 << order), order);
+ __free_frozen_pages(page + (1 << order), order,
+ fpi_flags);
}
}
+void __free_pages(struct page *page, unsigned int order)
+{
+ ___free_pages(page, order, FPI_NONE);
+}
EXPORT_SYMBOL(__free_pages);
+/*
+ * Can be called while holding raw_spin_lock or from IRQ and NMI for any
+ * page type (not only those that came from try_alloc_pages)
+ */
+void free_pages_nolock(struct page *page, unsigned int order)
+{
+ ___free_pages(page, order, FPI_TRYLOCK);
+}
+
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
@@ -4864,7 +5074,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
split_page_owner(page, order, 0);
pgalloc_tag_split(page_folio(page), order, 0);
- split_page_memcg(page, order, 0);
+ split_page_memcg(page, order);
while (page < --last)
set_page_refcounted(last);
@@ -5827,6 +6037,7 @@ static void calculate_totalreserve_pages(void)
}
}
totalreserve_pages = reserve_pages;
+ trace_mm_calculate_totalreserve_pages(totalreserve_pages);
}
/*
@@ -5849,14 +6060,15 @@ static void setup_per_zone_lowmem_reserve(void)
for (j = i + 1; j < MAX_NR_ZONES; j++) {
struct zone *upper_zone = &pgdat->node_zones[j];
- bool empty = !zone_managed_pages(upper_zone);
managed_pages += zone_managed_pages(upper_zone);
- if (clear || empty)
+ if (clear)
zone->lowmem_reserve[j] = 0;
else
zone->lowmem_reserve[j] = managed_pages / ratio;
+ trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone,
+ zone->lowmem_reserve[j]);
}
}
}
@@ -5920,6 +6132,7 @@ static void __setup_per_zone_wmarks(void)
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
+ trace_mm_setup_per_zone_wmarks(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -6193,6 +6406,15 @@ static const struct ctl_table page_alloc_sysctl_table[] = {
.extra2 = SYSCTL_THREE_THOUSAND,
},
{
+ .procname = "defrag_mode",
+ .data = &defrag_mode,
+ .maxlen = sizeof(defrag_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "percpu_pagelist_high_fraction",
.data = &percpu_pagelist_high_fraction,
.maxlen = sizeof(percpu_pagelist_high_fraction),
@@ -7004,7 +7226,7 @@ static inline bool has_unaccepted_memory(void)
static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
- long to_accept;
+ long to_accept, wmark;
bool ret = false;
if (!has_unaccepted_memory())
@@ -7013,8 +7235,18 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order)
if (list_empty(&zone->unaccepted_pages))
return false;
+ wmark = promo_wmark_pages(zone);
+
+ /*
+ * Watermarks have not been initialized yet.
+ *
+ * Accepting one MAX_ORDER page to ensure progress.
+ */
+ if (!wmark)
+ return try_to_accept_memory_one(zone);
+
/* How much to accept to get to promo watermark? */
- to_accept = promo_wmark_pages(zone) -
+ to_accept = wmark -
(zone_page_state(zone, NR_FREE_PAGES) -
__zone_watermark_unusable_free(zone, order, 0) -
zone_page_state(zone, NR_UNACCEPTED));
@@ -7071,3 +7303,97 @@ static bool __free_unaccepted(struct page *page)
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
+
+/**
+ * try_alloc_pages - opportunistic reentrant allocation from any context
+ * @nid: node to allocate from
+ * @order: allocation order size
+ *
+ * Allocates pages of a given order from the given node. This is safe to
+ * call from any context (from atomic, NMI, and also reentrant
+ * allocator -> tracepoint -> try_alloc_pages_noprof).
+ * Allocation is best effort and to be expected to fail easily so nobody should
+ * rely on the success. Failures are not reported via warn_alloc().
+ * See always fail conditions below.
+ *
+ * Return: allocated page or NULL on failure.
+ */
+struct page *try_alloc_pages_noprof(int nid, unsigned int order)
+{
+ /*
+ * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed.
+ * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd
+ * is not safe in arbitrary context.
+ *
+ * These two are the conditions for gfpflags_allow_spinning() being true.
+ *
+ * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason
+ * to warn. Also warn would trigger printk() which is unsafe from
+ * various contexts. We cannot use printk_deferred_enter() to mitigate,
+ * since the running context is unknown.
+ *
+ * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below
+ * is safe in any context. Also zeroing the page is mandatory for
+ * BPF use cases.
+ *
+ * Though __GFP_NOMEMALLOC is not checked in the code path below,
+ * specify it here to highlight that try_alloc_pages()
+ * doesn't want to deplete reserves.
+ */
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
+ | __GFP_ACCOUNT;
+ unsigned int alloc_flags = ALLOC_TRYLOCK;
+ struct alloc_context ac = { };
+ struct page *page;
+
+ /*
+ * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
+ * unsafe in NMI. If spin_trylock() is called from hard IRQ the current
+ * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will
+ * mark the task as the owner of another rt_spin_lock which will
+ * confuse PI logic, so return immediately if called form hard IRQ or
+ * NMI.
+ *
+ * Note, irqs_disabled() case is ok. This function can be called
+ * from raw_spin_lock_irqsave region.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+ return NULL;
+ if (!pcp_allowed_order(order))
+ return NULL;
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ /* Bailout, since try_to_accept_memory_one() needs to take a lock */
+ if (has_unaccepted_memory())
+ return NULL;
+#endif
+ /* Bailout, since _deferred_grow_zone() needs to take a lock */
+ if (deferred_pages_enabled())
+ return NULL;
+
+ if (nid == NUMA_NO_NODE)
+ nid = numa_node_id();
+
+ prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
+ &alloc_gfp, &alloc_flags);
+
+ /*
+ * Best effort allocation from percpu free list.
+ * If it's empty attempt to spin_trylock zone->lock.
+ */
+ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
+
+ /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
+
+ if (page)
+ set_page_refcounted(page);
+
+ if (memcg_kmem_online() && page &&
+ unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
+ free_pages_nolock(page, order);
+ page = NULL;
+ }
+ trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
+ kmsan_alloc_page(page, order, alloc_gfp);
+ return page;
+}
diff --git a/mm/page_counter.c b/mm/page_counter.c
index af23f927611b..661e0f2a5127 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -121,6 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter,
{
struct page_counter *c;
bool protection = track_protection(counter);
+ bool track_failcnt = counter->track_failcnt;
for (c = counter; c; c = c->parent) {
long new;
@@ -146,7 +147,8 @@ bool page_counter_try_charge(struct page_counter *counter,
* inaccuracy in the failcnt which is only used
* to report stats.
*/
- data_race(c->failcnt++);
+ if (track_failcnt)
+ data_race(c->failcnt++);
*fail = c;
goto failed;
}
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 641d93f6af4c..c351fdfe9e9a 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -508,6 +508,19 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
#endif
/**
+ * page_ext_lookup() - Lookup a page extension for a PFN.
+ * @pfn: PFN of the page we're interested in.
+ *
+ * Must be called with RCU read lock taken and @pfn must be valid.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ */
+struct page_ext *page_ext_lookup(unsigned long pfn)
+{
+ return lookup_page_ext(pfn_to_page(pfn));
+}
+
+/**
* page_ext_get() - Get the extended information for a page.
* @page: The page we're interested in.
*
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 947c7c7a3728..408aaf29a3ea 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -62,9 +62,14 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio,
/*
* For PTE-mapped THP, one sub page is referenced,
* the whole THP is referenced.
+ *
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are "old" from a CPU perspective.
+ * The MMU notifier takes care of any device aspects.
*/
- if (ptep_clear_young_notify(vma, addr, pvmw.pte))
- referenced = true;
+ if (likely(pte_present(ptep_get(pvmw.pte))))
+ referenced |= ptep_test_and_clear_young(vma, addr, pvmw.pte);
+ referenced |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE);
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
referenced = true;
diff --git a/mm/page_io.c b/mm/page_io.c
index 9b983de351f9..4bce19df557b 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -638,11 +638,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
if (swap_read_folio_zeromap(folio)) {
folio_unlock(folio);
goto finish;
- } else if (zswap_load(folio)) {
- folio_unlock(folio);
- goto finish;
}
+ if (zswap_load(folio) != -ENOENT)
+ goto finish;
+
/* We have to read from slower devices. Increase zswap protection. */
zswap_folio_swapin(folio);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c608e9d72865..b2fc5266e3d2 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -83,7 +83,14 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
unsigned int skip_pages;
if (PageHuge(page)) {
- if (!hugepage_migration_supported(folio_hstate(folio)))
+ struct hstate *h;
+
+ /*
+ * The huge page may be freed so can not
+ * use folio_hstate() directly.
+ */
+ h = size_to_hstate(folio_size(folio));
+ if (h && !hugepage_migration_supported(h))
return page;
} else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) {
return page;
@@ -608,6 +615,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
int ret;
/*
+ * Due to the deferred freeing of hugetlb folios, the hugepage folios may
+ * not immediately release to the buddy system. This can cause PageBuddy()
+ * to fail in __test_page_isolated_in_pageblock(). To ensure that the
+ * hugetlb folios are properly released back to the buddy system, we
+ * invoke the wait_for_freed_hugetlb_folios() function to wait for the
+ * release to complete.
+ */
+ wait_for_freed_hugetlb_folios();
+
+ /*
* Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free
* pages are not aligned to pageblock_nr_pages.
* Then we just check migratetype first.
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 2d6360eaccbb..cc4a6916eec6 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -229,17 +229,19 @@ static void dec_stack_record_count(depot_stack_handle_t handle,
handle);
}
-static inline void __update_page_owner_handle(struct page_ext *page_ext,
+static inline void __update_page_owner_handle(struct page *page,
depot_stack_handle_t handle,
unsigned short order,
gfp_t gfp_mask,
short last_migrate_reason, u64 ts_nsec,
pid_t pid, pid_t tgid, char *comm)
{
- int i;
+ struct page_ext_iter iter;
+ struct page_ext *page_ext;
struct page_owner *page_owner;
- for (i = 0; i < (1 << order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, 1 << order, page_ext, iter) {
page_owner = get_page_owner(page_ext);
page_owner->handle = handle;
page_owner->order = order;
@@ -252,20 +254,22 @@ static inline void __update_page_owner_handle(struct page_ext *page_ext,
sizeof(page_owner->comm));
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
- page_ext = page_ext_next(page_ext);
}
+ rcu_read_unlock();
}
-static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
+static inline void __update_page_owner_free_handle(struct page *page,
depot_stack_handle_t handle,
unsigned short order,
pid_t pid, pid_t tgid,
u64 free_ts_nsec)
{
- int i;
+ struct page_ext_iter iter;
+ struct page_ext *page_ext;
struct page_owner *page_owner;
- for (i = 0; i < (1 << order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, 1 << order, page_ext, iter) {
page_owner = get_page_owner(page_ext);
/* Only __reset_page_owner() wants to clear the bit */
if (handle) {
@@ -275,8 +279,8 @@ static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
page_owner->free_ts_nsec = free_ts_nsec;
page_owner->free_pid = current->pid;
page_owner->free_tgid = current->tgid;
- page_ext = page_ext_next(page_ext);
}
+ rcu_read_unlock();
}
void __reset_page_owner(struct page *page, unsigned short order)
@@ -293,11 +297,17 @@ void __reset_page_owner(struct page *page, unsigned short order)
page_owner = get_page_owner(page_ext);
alloc_handle = page_owner->handle;
+ page_ext_put(page_ext);
- handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
- __update_page_owner_free_handle(page_ext, handle, order, current->pid,
+ /*
+ * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false
+ * to prevent issues in stack_depot_save().
+ * This is similar to try_alloc_pages() gfp flags, but only used
+ * to signal stack_depot to avoid spin_locks.
+ */
+ handle = save_stack(__GFP_NOWARN);
+ __update_page_owner_free_handle(page, handle, order, current->pid,
current->tgid, free_ts_nsec);
- page_ext_put(page_ext);
if (alloc_handle != early_handle)
/*
@@ -313,19 +323,13 @@ void __reset_page_owner(struct page *page, unsigned short order)
noinline void __set_page_owner(struct page *page, unsigned short order,
gfp_t gfp_mask)
{
- struct page_ext *page_ext;
u64 ts_nsec = local_clock();
depot_stack_handle_t handle;
handle = save_stack(gfp_mask);
-
- page_ext = page_ext_get(page);
- if (unlikely(!page_ext))
- return;
- __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1,
+ __update_page_owner_handle(page, handle, order, gfp_mask, -1,
ts_nsec, current->pid, current->tgid,
current->comm);
- page_ext_put(page_ext);
inc_stack_record_count(handle, gfp_mask, 1 << order);
}
@@ -344,44 +348,42 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
void __split_page_owner(struct page *page, int old_order, int new_order)
{
- int i;
- struct page_ext *page_ext = page_ext_get(page);
+ struct page_ext_iter iter;
+ struct page_ext *page_ext;
struct page_owner *page_owner;
- if (unlikely(!page_ext))
- return;
-
- for (i = 0; i < (1 << old_order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, 1 << old_order, page_ext, iter) {
page_owner = get_page_owner(page_ext);
page_owner->order = new_order;
- page_ext = page_ext_next(page_ext);
}
- page_ext_put(page_ext);
+ rcu_read_unlock();
}
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
- int i;
- struct page_ext *old_ext;
- struct page_ext *new_ext;
+ struct page_ext *page_ext;
+ struct page_ext_iter iter;
struct page_owner *old_page_owner;
struct page_owner *new_page_owner;
depot_stack_handle_t migrate_handle;
- old_ext = page_ext_get(&old->page);
- if (unlikely(!old_ext))
+ page_ext = page_ext_get(&old->page);
+ if (unlikely(!page_ext))
return;
- new_ext = page_ext_get(&newfolio->page);
- if (unlikely(!new_ext)) {
- page_ext_put(old_ext);
+ old_page_owner = get_page_owner(page_ext);
+ page_ext_put(page_ext);
+
+ page_ext = page_ext_get(&newfolio->page);
+ if (unlikely(!page_ext))
return;
- }
- old_page_owner = get_page_owner(old_ext);
- new_page_owner = get_page_owner(new_ext);
+ new_page_owner = get_page_owner(page_ext);
+ page_ext_put(page_ext);
+
migrate_handle = new_page_owner->handle;
- __update_page_owner_handle(new_ext, old_page_owner->handle,
+ __update_page_owner_handle(&newfolio->page, old_page_owner->handle,
old_page_owner->order, old_page_owner->gfp_mask,
old_page_owner->last_migrate_reason,
old_page_owner->ts_nsec, old_page_owner->pid,
@@ -391,7 +393,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
* will be freed after migration. Keep them until then as they may be
* useful.
*/
- __update_page_owner_free_handle(new_ext, 0, old_page_owner->order,
+ __update_page_owner_free_handle(&newfolio->page, 0, old_page_owner->order,
old_page_owner->free_pid,
old_page_owner->free_tgid,
old_page_owner->free_ts_nsec);
@@ -400,14 +402,12 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
* for the new one and the old folio otherwise there will be an imbalance
* when subtracting those pages from the stack.
*/
- for (i = 0; i < (1 << new_page_owner->order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(&old->page, 1 << new_page_owner->order, page_ext, iter) {
+ old_page_owner = get_page_owner(page_ext);
old_page_owner->handle = migrate_handle;
- old_ext = page_ext_next(old_ext);
- old_page_owner = get_page_owner(old_ext);
}
-
- page_ext_put(new_ext);
- page_ext_put(old_ext);
+ rcu_read_unlock();
}
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -507,7 +507,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
rcu_read_lock();
memcg_data = READ_ONCE(page->memcg_data);
- if (!memcg_data)
+ if (!memcg_data || PageTail(page))
goto out_unlock;
if (memcg_data & MEMCG_DATA_OBJEXTS)
@@ -813,7 +813,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
goto ext_put_continue;
/* Found early allocated page */
- __update_page_owner_handle(page_ext, early_handle, 0, 0,
+ __update_page_owner_handle(page, early_handle, 0, 0,
-1, local_clock(), current->pid,
current->tgid, current->comm);
count++;
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 509c6ef8de40..68109ee93841 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -62,24 +62,20 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
*/
static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
{
+ struct page_ext_iter iter;
struct page_ext *page_ext;
struct page *page;
- unsigned long i;
bool anon;
if (!pfn_valid(pfn))
return;
page = pfn_to_page(pfn);
- page_ext = page_ext_get(page);
-
- if (!page_ext)
- return;
-
BUG_ON(PageSlab(page));
anon = PageAnon(page);
- for (i = 0; i < pgcnt; i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, pgcnt, page_ext, iter) {
struct page_table_check *ptc = get_page_table_check(page_ext);
if (anon) {
@@ -89,9 +85,8 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
BUG_ON(atomic_read(&ptc->anon_map_count));
BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
}
- page_ext = page_ext_next(page_ext);
}
- page_ext_put(page_ext);
+ rcu_read_unlock();
}
/*
@@ -102,24 +97,20 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
bool rw)
{
+ struct page_ext_iter iter;
struct page_ext *page_ext;
struct page *page;
- unsigned long i;
bool anon;
if (!pfn_valid(pfn))
return;
page = pfn_to_page(pfn);
- page_ext = page_ext_get(page);
-
- if (!page_ext)
- return;
-
BUG_ON(PageSlab(page));
anon = PageAnon(page);
- for (i = 0; i < pgcnt; i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, pgcnt, page_ext, iter) {
struct page_table_check *ptc = get_page_table_check(page_ext);
if (anon) {
@@ -129,9 +120,8 @@ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
BUG_ON(atomic_read(&ptc->anon_map_count));
BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0);
}
- page_ext = page_ext_next(page_ext);
}
- page_ext_put(page_ext);
+ rcu_read_unlock();
}
/*
@@ -140,24 +130,19 @@ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
*/
void __page_table_check_zero(struct page *page, unsigned int order)
{
+ struct page_ext_iter iter;
struct page_ext *page_ext;
- unsigned long i;
BUG_ON(PageSlab(page));
- page_ext = page_ext_get(page);
-
- if (!page_ext)
- return;
-
- for (i = 0; i < (1ul << order); i++) {
+ rcu_read_lock();
+ for_each_page_ext(page, 1 << order, page_ext, iter) {
struct page_table_check *ptc = get_page_table_check(page_ext);
BUG_ON(atomic_read(&ptc->anon_map_count));
BUG_ON(atomic_read(&ptc->file_map_count));
- page_ext = page_ext_next(page_ext);
}
- page_ext_put(page_ext);
+ rcu_read_unlock();
}
void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
@@ -196,9 +181,8 @@ EXPORT_SYMBOL(__page_table_check_pud_clear);
/* Whether the swap entry cached writable information */
static inline bool swap_cached_writable(swp_entry_t entry)
{
- return is_writable_device_exclusive_entry(entry) ||
- is_writable_device_private_entry(entry) ||
- is_writable_migration_entry(entry);
+ return is_writable_device_private_entry(entry) ||
+ is_writable_migration_entry(entry);
}
static inline void page_table_check_pte_flags(pte_t pte)
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 81839a9e74f1..e463c3be934a 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -84,6 +84,7 @@ again:
* mapped at the @pvmw->pte
* @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range
* for checking
+ * @pte_nr: the number of small pages described by @pvmw->pte.
*
* page_vma_mapped_walk() found a place where pfn range is *potentially*
* mapped. check_pte() has to validate this.
@@ -100,7 +101,7 @@ again:
* Otherwise, return false.
*
*/
-static bool check_pte(struct page_vma_mapped_walk *pvmw)
+static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr)
{
unsigned long pfn;
pte_t ptent = ptep_get(pvmw->pte);
@@ -111,8 +112,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
return false;
entry = pte_to_swp_entry(ptent);
- if (!is_migration_entry(entry) &&
- !is_device_exclusive_entry(entry))
+ if (!is_migration_entry(entry))
return false;
pfn = swp_offset_pfn(entry);
@@ -133,7 +133,11 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
pfn = pte_pfn(ptent);
}
- return (pfn - pvmw->pfn) < pvmw->nr_pages;
+ if ((pfn + pte_nr - 1) < pvmw->pfn)
+ return false;
+ if (pfn > (pvmw->pfn + pvmw->nr_pages - 1))
+ return false;
+ return true;
}
/* Returns true if the two ranges overlap. Careful to not overflow. */
@@ -208,7 +212,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
return false;
pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte);
- if (!check_pte(pvmw))
+ if (!check_pte(pvmw, pages_per_huge_page(hstate)))
return not_found(pvmw);
return true;
}
@@ -291,7 +295,7 @@ restart:
goto next_pte;
}
this_pte:
- if (check_pte(pvmw))
+ if (check_pte(pvmw, 1))
return true;
next_pte:
do {
diff --git a/mm/percpu.c b/mm/percpu.c
index ac61e3fc5f15..b35494c8ede2 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1745,7 +1745,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
gfp = current_gfp_context(gfp);
/* whitelisted flags that can be passed to the backing allocators */
pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
- is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
+ is_atomic = !gfpflags_allow_blocking(gfp);
do_warn = !(gfp & __GFP_NOWARN);
/*
@@ -2191,7 +2191,12 @@ static void pcpu_balance_workfn(struct work_struct *work)
* to grow other chunks. This then gives pcpu_reclaim_populated() time
* to move fully free chunks to the active list to be freed if
* appropriate.
+ *
+ * Enforce GFP_NOIO allocations because we have pcpu_alloc users
+ * constrained to GFP_NOIO/NOFS contexts and they could form lock
+ * dependency through pcpu_alloc_mutex
*/
+ unsigned int flags = memalloc_noio_save();
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
@@ -2202,6 +2207,7 @@ static void pcpu_balance_workfn(struct work_struct *work)
spin_unlock_irq(&pcpu_lock);
mutex_unlock(&pcpu_alloc_mutex);
+ memalloc_noio_restore(flags);
}
/**
@@ -3071,7 +3077,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
continue;
}
/* copy and return the unused part */
- memcpy(ptr, __per_cpu_load, ai->static_size);
+ memcpy(ptr, __per_cpu_start, ai->static_size);
pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
}
}
@@ -3240,7 +3246,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t
flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);
/* copy static data */
- memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
+ memcpy((void *)unit_addr, __per_cpu_start, ai->static_size);
}
/* we're ready, commit */
diff --git a/mm/readahead.c b/mm/readahead.c
index 220155a5c964..6a4e96b69702 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -128,7 +128,6 @@
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
#include "internal.h"
@@ -559,15 +558,6 @@ void page_cache_sync_ra(struct readahead_control *ractl,
pgoff_t prev_index, miss;
/*
- * If we have pre-content watches we need to disable readahead to make
- * sure that we don't find 0 filled pages in cache that we never emitted
- * events for. Filesystems supporting HSM must make sure to not call
- * this function with ractl->file unset for files handled by HSM.
- */
- if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
- return;
-
- /*
* Even if readahead is disabled, issue this request as readahead
* as we'll need it to satisfy the requested range. The forced
* readahead will do the right thing and limit the read to just the
@@ -645,10 +635,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
if (!ra->ra_pages)
return;
- /* See the comment in page_cache_sync_ra. */
- if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
- return;
-
/*
* Same bit is used for PG_readahead and PG_reclaim.
*/
diff --git a/mm/rmap.c b/mm/rmap.c
index c6c4d4ea29a7..67bb273dfb80 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -672,7 +672,7 @@ void try_to_unmap_flush_dirty(void)
(TLB_FLUSH_BATCH_PENDING_MASK / 2)
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
- unsigned long uaddr)
+ unsigned long start, unsigned long end)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
int batch;
@@ -681,7 +681,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
if (!pte_accessible(mm, pteval))
return;
- arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
+ arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end);
tlb_ubc->flush_required = true;
/*
@@ -757,7 +757,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
- unsigned long uaddr)
+ unsigned long start, unsigned long end)
{
}
@@ -889,7 +889,7 @@ static bool folio_referenced_one(struct folio *folio,
if ((!atomic_read(&vma->vm_mm->mm_users) ||
check_stable_address_space(vma->vm_mm)) &&
folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_likely_mapped_shared(folio)) {
+ !folio_maybe_mapped_shared(folio)) {
pra->referenced = -1;
page_vma_mapped_walk_done(&pvmw);
return false;
@@ -1044,6 +1044,14 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
pte_t *pte = pvmw->pte;
pte_t entry = ptep_get(pte);
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are clean and not writable from a
+ * CPU perspective. The MMU notifier takes care of any
+ * device aspects.
+ */
+ if (!pte_present(entry))
+ continue;
if (!pte_dirty(entry) && !pte_write(entry))
continue;
@@ -1127,6 +1135,80 @@ int folio_mkclean(struct folio *folio)
}
EXPORT_SYMBOL_GPL(folio_mkclean);
+struct wrprotect_file_state {
+ int cleaned;
+ pgoff_t pgoff;
+ unsigned long pfn;
+ unsigned long nr_pages;
+};
+
+static bool mapping_wrprotect_range_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long address, void *arg)
+{
+ struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
+ struct page_vma_mapped_walk pvmw = {
+ .pfn = state->pfn,
+ .nr_pages = state->nr_pages,
+ .pgoff = state->pgoff,
+ .vma = vma,
+ .address = address,
+ .flags = PVMW_SYNC,
+ };
+
+ state->cleaned += page_vma_mkclean_one(&pvmw);
+
+ return true;
+}
+
+static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
+ pgoff_t pgoff_start, unsigned long nr_pages,
+ struct rmap_walk_control *rwc, bool locked);
+
+/**
+ * mapping_wrprotect_range() - Write-protect all mappings in a specified range.
+ *
+ * @mapping: The mapping whose reverse mapping should be traversed.
+ * @pgoff: The page offset at which @pfn is mapped within @mapping.
+ * @pfn: The PFN of the page mapped in @mapping at @pgoff.
+ * @nr_pages: The number of physically contiguous base pages spanned.
+ *
+ * Traverses the reverse mapping, finding all VMAs which contain a shared
+ * mapping of the pages in the specified range in @mapping, and write-protects
+ * them (that is, updates the page tables to mark the mappings read-only such
+ * that a write protection fault arises when the mappings are written to).
+ *
+ * The @pfn value need not refer to a folio, but rather can reference a kernel
+ * allocation which is mapped into userland. We therefore do not require that
+ * the page maps to a folio with a valid mapping or index field, rather the
+ * caller specifies these in @mapping and @pgoff.
+ *
+ * Return: the number of write-protected PTEs, or an error.
+ */
+int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
+ unsigned long pfn, unsigned long nr_pages)
+{
+ struct wrprotect_file_state state = {
+ .cleaned = 0,
+ .pgoff = pgoff,
+ .pfn = pfn,
+ .nr_pages = nr_pages,
+ };
+ struct rmap_walk_control rwc = {
+ .arg = (void *)&state,
+ .rmap_one = mapping_wrprotect_range_one,
+ .invalid_vma = invalid_mkclean_vma,
+ };
+
+ if (!mapping)
+ return 0;
+
+ __rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
+ /* locked = */false);
+
+ return state.cleaned;
+}
+EXPORT_SYMBOL_GPL(mapping_wrprotect_range);
+
/**
* pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
* [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
@@ -1160,8 +1242,8 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
}
static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
- struct page *page, int nr_pages, enum rmap_level level,
- int *nr_pmdmapped)
+ struct page *page, int nr_pages, struct vm_area_struct *vma,
+ enum rmap_level level, int *nr_pmdmapped)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
const int orig_nr_pages = nr_pages;
@@ -1176,6 +1258,16 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
break;
}
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma);
+ if (nr == orig_nr_pages)
+ /* Was completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ else
+ nr = 0;
+ break;
+ }
+
do {
first += atomic_inc_and_test(&page->_mapcount);
} while (page++, --nr_pages > 0);
@@ -1184,15 +1276,34 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED)
nr = first;
- atomic_add(orig_nr_pages, &folio->_large_mapcount);
+ folio_add_large_mapcount(folio, orig_nr_pages, vma);
break;
case RMAP_LEVEL_PMD:
+ case RMAP_LEVEL_PUD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ if (level == RMAP_LEVEL_PMD && first)
+ *nr_pmdmapped = folio_large_nr_pages(folio);
+ nr = folio_inc_return_large_mapcount(folio, vma);
+ if (nr == 1)
+ /* Was completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ else
+ nr = 0;
+ break;
+ }
+
if (first) {
nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
- *nr_pmdmapped = folio_nr_pages(folio);
- nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ nr_pages = folio_large_nr_pages(folio);
+ /*
+ * We only track PMD mappings of PMD-sized
+ * folios separately.
+ */
+ if (level == RMAP_LEVEL_PMD)
+ *nr_pmdmapped = nr_pages;
+ nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of a remove and another add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1201,7 +1312,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
nr = 0;
}
}
- atomic_inc(&folio->_large_mapcount);
+ folio_inc_large_mapcount(folio, vma);
break;
}
return nr;
@@ -1322,7 +1433,7 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
+ nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
if (likely(!folio_test_ksm(folio)))
__page_check_anon_rmap(folio, page, vma, address);
@@ -1338,15 +1449,32 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
case RMAP_LEVEL_PMD:
SetPageAnonExclusive(page);
break;
+ case RMAP_LEVEL_PUD:
+ /*
+ * Keep the compiler happy, we don't support anonymous
+ * PUD mappings.
+ */
+ WARN_ON_ONCE(1);
+ break;
}
}
+
+ VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) &&
+ atomic_read(&folio->_mapcount) > 0, folio);
for (i = 0; i < nr_pages; i++) {
struct page *cur_page = page + i;
- /* While PTE-mapping a THP we have a PMD and a PTE mapping. */
- VM_WARN_ON_FOLIO((atomic_read(&cur_page->_mapcount) > 0 ||
- (folio_test_large(folio) &&
- folio_entire_mapcount(folio) > 1)) &&
+ VM_WARN_ON_FOLIO(folio_test_large(folio) &&
+ folio_entire_mapcount(folio) > 1 &&
+ PageAnonExclusive(cur_page), folio);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
+ continue;
+
+ /*
+ * While PTE-mapping a THP we have a PMD and a PTE
+ * mapping.
+ */
+ VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 &&
PageAnonExclusive(cur_page), folio);
}
@@ -1426,14 +1554,11 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, rmap_t flags)
{
- const int nr = folio_nr_pages(folio);
const bool exclusive = flags & RMAP_EXCLUSIVE;
- int nr_pmdmapped = 0;
+ int nr = 1, nr_pmdmapped = 0;
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);
- VM_BUG_ON_VMA(address < vma->vm_start ||
- address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
/*
* VM_DROPPABLE mappings don't swap; instead they're just dropped when
@@ -1451,29 +1576,35 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
} else if (!folio_test_pmd_mappable(folio)) {
int i;
+ nr = folio_large_nr_pages(folio);
for (i = 0; i < nr; i++) {
struct page *page = folio_page(folio, i);
- /* increment count (starts at -1) */
- atomic_set(&page->_mapcount, 0);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ /* increment count (starts at -1) */
+ atomic_set(&page->_mapcount, 0);
if (exclusive)
SetPageAnonExclusive(page);
}
- /* increment count (starts at -1) */
- atomic_set(&folio->_large_mapcount, nr - 1);
- atomic_set(&folio->_nr_pages_mapped, nr);
+ folio_set_large_mapcount(folio, nr, vma);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, nr);
} else {
+ nr = folio_large_nr_pages(folio);
/* increment count (starts at -1) */
atomic_set(&folio->_entire_mapcount, 0);
- /* increment count (starts at -1) */
- atomic_set(&folio->_large_mapcount, 0);
- atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
+ folio_set_large_mapcount(folio, 1, vma);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
if (exclusive)
SetPageAnonExclusive(&folio->page);
nr_pmdmapped = nr;
}
+ VM_WARN_ON_ONCE(address < vma->vm_start ||
+ address + (nr << PAGE_SHIFT) > vma->vm_end);
+
__folio_mod_stat(folio, nr, nr_pmdmapped);
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
}
@@ -1486,7 +1617,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio,
VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
+ nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
__folio_mod_stat(folio, nr, nr_pmdmapped);
/* See comments in folio_add_anon_rmap_*() */
@@ -1531,6 +1662,27 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
#endif
}
+/**
+ * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
+ * @folio: The folio to add the mapping to
+ * @page: The first page to add
+ * @vma: The vm area in which the mapping is added
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
+ struct vm_area_struct *vma)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
+
static __always_inline void __folio_remove_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
enum rmap_level level)
@@ -1548,7 +1700,20 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
break;
}
- atomic_sub(nr_pages, &folio->_large_mapcount);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
+ if (!nr) {
+ /* Now completely unmapped. */
+ nr = folio_nr_pages(folio);
+ } else {
+ partially_mapped = nr < folio_large_nr_pages(folio) &&
+ !folio_entire_mapcount(folio);
+ nr = 0;
+ }
+ break;
+ }
+
+ folio_sub_large_mapcount(folio, nr_pages, vma);
do {
last += atomic_add_negative(-1, &page->_mapcount);
} while (page++, --nr_pages > 0);
@@ -1560,13 +1725,32 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
partially_mapped = nr && atomic_read(mapped);
break;
case RMAP_LEVEL_PMD:
- atomic_dec(&folio->_large_mapcount);
+ case RMAP_LEVEL_PUD:
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ last = atomic_add_negative(-1, &folio->_entire_mapcount);
+ if (level == RMAP_LEVEL_PMD && last)
+ nr_pmdmapped = folio_large_nr_pages(folio);
+ nr = folio_dec_return_large_mapcount(folio, vma);
+ if (!nr) {
+ /* Now completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ } else {
+ partially_mapped = last &&
+ nr < folio_large_nr_pages(folio);
+ nr = 0;
+ }
+ break;
+ }
+
+ folio_dec_large_mapcount(folio, vma);
last = atomic_add_negative(-1, &folio->_entire_mapcount);
if (last) {
nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED)) {
- nr_pmdmapped = folio_nr_pages(folio);
- nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ nr_pages = folio_large_nr_pages(folio);
+ if (level == RMAP_LEVEL_PMD)
+ nr_pmdmapped = nr_pages;
+ nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of another remove and an add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1640,6 +1824,46 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
#endif
}
+/**
+ * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
+ * @folio: The folio to remove the mapping from
+ * @page: The first page to remove
+ * @vma: The vm area from which the mapping is removed
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_remove_rmap_pud(struct folio *folio, struct page *page,
+ struct vm_area_struct *vma)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
+
+/* We support batch unmapping of PTEs for lazyfree large folios */
+static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
+ struct folio *folio, pte_t *ptep)
+{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+ int max_nr = folio_nr_pages(folio);
+ pte_t pte = ptep_get(ptep);
+
+ if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
+ return false;
+ if (pte_unused(pte))
+ return false;
+ if (pte_pfn(pte) != folio_pfn(folio))
+ return false;
+
+ return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
+ NULL, NULL) == max_nr;
+}
+
/*
* @arg: enum ttu_flags will be passed to this argument
*/
@@ -1648,11 +1872,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ bool anon_exclusive, ret = true;
pte_t pteval;
struct page *subpage;
- bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ unsigned long nr_pages = 1, end_addr;
unsigned long pfn;
unsigned long hsz = 0;
@@ -1702,9 +1927,16 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
}
if (!pvmw.pte) {
- if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd,
- folio))
- goto walk_done;
+ if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
+ if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
+ goto walk_done;
+ /*
+ * unmap_huge_pmd_locked has either already marked
+ * the folio as swap-backed or decided to retain it
+ * due to GUP or speculative references.
+ */
+ goto walk_abort;
+ }
if (flags & TTU_SPLIT_HUGE_PMD) {
/*
@@ -1722,7 +1954,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- pfn = pte_pfn(ptep_get(pvmw.pte));
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages.
+ */
+ pteval = ptep_get(pvmw.pte);
+ if (likely(pte_present(pteval))) {
+ pfn = pte_pfn(pteval);
+ } else {
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
+ }
+
subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
@@ -1778,24 +2021,33 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
- } else {
- flush_cache_page(vma, address, pfn);
- /* Nuke the page table entry. */
- if (should_defer_flush(mm, flags)) {
- /*
- * We clear the PTE but do not flush so potentially
- * a remote CPU could still be writing to the folio.
- * If the entry was previously clean then the
- * architecture must guarantee that a clear->dirty
- * transition on a cached TLB entry is written through
- * and traps if the PTE is unmapped.
- */
- pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ } else if (likely(pte_present(pteval))) {
+ if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
+ can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
+ nr_pages = folio_nr_pages(folio);
+ end_addr = address + nr_pages * PAGE_SIZE;
+ flush_cache_range(vma, address, end_addr);
- set_tlb_ubc_flush_pending(mm, pteval, address);
- } else {
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
- }
+ /* Nuke the page table entry. */
+ pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+ /*
+ * We clear the PTE but do not flush so potentially
+ * a remote CPU could still be writing to the folio.
+ * If the entry was previously clean then the
+ * architecture must guarantee that a clear->dirty
+ * transition on a cached TLB entry is written through
+ * and traps if the PTE is unmapped.
+ */
+ if (should_defer_flush(mm, flags))
+ set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
+ else
+ flush_tlb_range(vma, address, end_addr);
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ } else {
+ pte_clear(mm, address, pvmw.pte);
}
/*
@@ -1805,10 +2057,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
-
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
@@ -1822,8 +2070,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
-
- } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+ !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
@@ -1868,40 +2116,41 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
smp_rmb();
- /*
- * The only page refs must be one from isolation
- * plus the rmap(s) (dropped by discard:).
- */
- if (ref_count == 1 + map_count &&
- (!folio_test_dirty(folio) ||
- /*
- * Unlike MADV_FREE mappings, VM_DROPPABLE
- * ones can be dropped even if they've
- * been dirtied.
- */
- (vma->vm_flags & VM_DROPPABLE))) {
- dec_mm_counter(mm, MM_ANONPAGES);
- goto discard;
- }
-
- /*
- * If the folio was redirtied, it cannot be
- * discarded. Remap the page to page table.
- */
- set_pte_at(mm, address, pvmw.pte, pteval);
- /*
- * Unlike MADV_FREE mappings, VM_DROPPABLE ones
- * never get swap backed on failure to drop.
- */
- if (!(vma->vm_flags & VM_DROPPABLE))
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ /*
+ * redirtied either using the page table or a previously
+ * obtained GUP reference.
+ */
+ set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
folio_set_swapbacked(folio);
- goto walk_abort;
+ goto walk_abort;
+ } else if (ref_count != 1 + map_count) {
+ /*
+ * Additional reference. Could be a GUP reference or any
+ * speculative reference. GUP users must mark the folio
+ * dirty if there was a modification. This folio cannot be
+ * reclaimed right now either way, so act just like nothing
+ * happened.
+ * We'll come back here later and detect if the folio was
+ * dirtied when the additional reference is gone.
+ */
+ set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
+ goto walk_abort;
+ }
+ add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
+ goto discard;
}
if (swap_duplicate(entry) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
+
+ /*
+ * arch_unmap_one() is expected to be a NOP on
+ * architectures where we could have PFN swap PTEs,
+ * so we'll not check/care.
+ */
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
swap_free(entry);
set_pte_at(mm, address, pvmw.pte, pteval);
@@ -1926,10 +2175,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (anon_exclusive)
swp_pte = pte_swp_mkexclusive(swp_pte);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ if (likely(pte_present(pteval))) {
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
set_pte_at(mm, address, pvmw.pte, swp_pte);
} else {
/*
@@ -1946,13 +2202,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter_file(folio));
}
discard:
- if (unlikely(folio_test_hugetlb(folio)))
+ if (unlikely(folio_test_hugetlb(folio))) {
hugetlb_remove_rmap(folio);
- else
- folio_remove_rmap_pte(folio, subpage, vma);
+ } else {
+ folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
+ folio_ref_sub(folio, nr_pages - 1);
+ }
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
folio_put(folio);
+ /* We have already batched the entire folio */
+ if (nr_pages > 1)
+ goto walk_done;
continue;
walk_abort:
ret = false;
@@ -2013,9 +2274,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ bool anon_exclusive, writable, ret = true;
pte_t pteval;
struct page *subpage;
- bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
unsigned long pfn;
@@ -2082,24 +2343,19 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- pfn = pte_pfn(ptep_get(pvmw.pte));
-
- if (folio_is_zone_device(folio)) {
- /*
- * Our PTE is a non-present device exclusive entry and
- * calculating the subpage as for the common case would
- * result in an invalid pointer.
- *
- * Since only PAGE_SIZE pages can currently be
- * migrated, just set it to page. This will need to be
- * changed when hugepage migrations to device private
- * memory are supported.
- */
- VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
- subpage = &folio->page;
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages.
+ */
+ pteval = ptep_get(pvmw.pte);
+ if (likely(pte_present(pteval))) {
+ pfn = pte_pfn(pteval);
} else {
- subpage = folio_page(folio, pfn - folio_pfn(folio));
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
}
+
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
PageAnonExclusive(subpage);
@@ -2155,7 +2411,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
- } else {
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ writable = pte_write(pteval);
+ } else if (likely(pte_present(pteval))) {
flush_cache_page(vma, address, pfn);
/* Nuke the page table entry. */
if (should_defer_flush(mm, flags)) {
@@ -2169,58 +2428,27 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- set_tlb_ubc_flush_pending(mm, pteval, address);
+ set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
} else {
pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ writable = pte_write(pteval);
+ } else {
+ pte_clear(mm, address, pvmw.pte);
+ writable = is_writable_device_private_entry(pte_to_swp_entry(pteval));
}
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
+ VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
+ !anon_exclusive, folio);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (folio_is_device_private(folio)) {
- unsigned long pfn = folio_pfn(folio);
- swp_entry_t entry;
- pte_t swp_pte;
-
- if (anon_exclusive)
- WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio,
- subpage));
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- entry = pte_to_swp_entry(pteval);
- if (is_writable_device_private_entry(entry))
- entry = make_writable_migration_entry(pfn);
- else if (anon_exclusive)
- entry = make_readable_exclusive_migration_entry(pfn);
- else
- entry = make_readable_migration_entry(pfn);
- swp_pte = swp_entry_to_pte(entry);
+ if (PageHWPoison(subpage)) {
+ VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);
- /*
- * pteval maps a zone device page and is therefore
- * a swap pte.
- */
- if (pte_swp_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_swp_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
- trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
- folio_order(folio));
- /*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
- */
- } else if (PageHWPoison(subpage)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
@@ -2230,8 +2458,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
-
- } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+ !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
@@ -2247,6 +2475,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t swp_pte;
+ /*
+ * arch_unmap_one() is expected to be a NOP on
+ * architectures where we could have PFN swap PTEs,
+ * so we'll not check/care.
+ */
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
if (folio_test_hugetlb(folio))
set_huge_pte_at(mm, address, pvmw.pte,
@@ -2257,8 +2490,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
- VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
- !anon_exclusive, subpage);
/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
if (folio_test_hugetlb(folio)) {
@@ -2283,7 +2514,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
- if (pte_write(pteval))
+ if (writable)
entry = make_writable_migration_entry(
page_to_pfn(subpage));
else if (anon_exclusive)
@@ -2292,15 +2523,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
- if (pte_young(pteval))
- entry = make_migration_entry_young(entry);
- if (pte_dirty(pteval))
- entry = make_migration_entry_dirty(entry);
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ if (likely(pte_present(pteval))) {
+ if (pte_young(pteval))
+ entry = make_migration_entry_young(entry);
+ if (pte_dirty(pteval))
+ entry = make_migration_entry_dirty(entry);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
if (folio_test_hugetlb(folio))
set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
hsz);
@@ -2375,190 +2614,139 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
}
#ifdef CONFIG_DEVICE_PRIVATE
-struct make_exclusive_args {
- struct mm_struct *mm;
- unsigned long address;
- void *owner;
- bool valid;
-};
-
-static bool page_make_device_exclusive_one(struct folio *folio,
- struct vm_area_struct *vma, unsigned long address, void *priv)
+/**
+ * make_device_exclusive() - Mark a page for exclusive use by a device
+ * @mm: mm_struct of associated target process
+ * @addr: the virtual address to mark for exclusive device access
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ * @foliop: folio pointer will be stored here on success.
+ *
+ * This function looks up the page mapped at the given address, grabs a
+ * folio reference, locks the folio and replaces the PTE with special
+ * device-exclusive PFN swap entry, preventing access through the process
+ * page tables. The function will return with the folio locked and referenced.
+ *
+ * On fault, the device-exclusive entries are replaced with the original PTE
+ * under folio lock, after calling MMU notifiers.
+ *
+ * Only anonymous non-hugetlb folios are supported and the VMA must have
+ * write permissions such that we can fault in the anonymous page writable
+ * in order to mark it exclusive. The caller must hold the mmap_lock in read
+ * mode.
+ *
+ * A driver using this to program access from a device must use a mmu notifier
+ * critical section to hold a device specific lock during programming. Once
+ * programming is complete it should drop the folio lock and reference after
+ * which point CPU access to the page will revoke the exclusive access.
+ *
+ * Notes:
+ * #. This function always operates on individual PTEs mapping individual
+ * pages. PMD-sized THPs are first remapped to be mapped by PTEs before
+ * the conversion happens on a single PTE corresponding to @addr.
+ * #. While concurrent access through the process page tables is prevented,
+ * concurrent access through other page references (e.g., earlier GUP
+ * invocation) is not handled and not supported.
+ * #. device-exclusive entries are considered "clean" and "old" by core-mm.
+ * Device drivers must update the folio state when informed by MMU
+ * notifiers.
+ *
+ * Returns: pointer to mapped page on success, otherwise a negative error.
+ */
+struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
+ void *owner, struct folio **foliop)
{
- struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
- struct make_exclusive_args *args = priv;
- pte_t pteval;
- struct page *subpage;
- bool ret = true;
struct mmu_notifier_range range;
+ struct folio *folio, *fw_folio;
+ struct vm_area_struct *vma;
+ struct folio_walk fw;
+ struct page *page;
swp_entry_t entry;
pte_t swp_pte;
- pte_t ptent;
-
- mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
- vma->vm_mm, address, min(vma->vm_end,
- address + folio_size(folio)),
- args->owner);
- mmu_notifier_invalidate_range_start(&range);
-
- while (page_vma_mapped_walk(&pvmw)) {
- /* Unexpected PMD-mapped THP? */
- VM_BUG_ON_FOLIO(!pvmw.pte, folio);
-
- ptent = ptep_get(pvmw.pte);
- if (!pte_present(ptent)) {
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
-
- subpage = folio_page(folio,
- pte_pfn(ptent) - folio_pfn(folio));
- address = pvmw.address;
-
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(ptent));
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
-
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
-
- /*
- * Check that our target page is still mapped at the expected
- * address.
- */
- if (args->mm == mm && args->address == address &&
- pte_write(pteval))
- args->valid = true;
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- if (pte_write(pteval))
- entry = make_writable_device_exclusive_entry(
- page_to_pfn(subpage));
- else
- entry = make_readable_device_exclusive_entry(
- page_to_pfn(subpage));
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ int ret;
- set_pte_at(mm, address, pvmw.pte, swp_pte);
+ mmap_assert_locked(mm);
+ addr = PAGE_ALIGN_DOWN(addr);
- /*
- * There is a reference on the page for the swap entry which has
- * been removed, so shouldn't take another.
- */
- folio_remove_rmap_pte(folio, subpage, vma);
+ /*
+ * Fault in the page writable and try to lock it; note that if the
+ * address would already be marked for exclusive use by a device,
+ * the GUP call would undo that first by triggering a fault.
+ *
+ * If any other device would already map this page exclusively, the
+ * fault will trigger a conversion to an ordinary
+ * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE.
+ */
+retry:
+ page = get_user_page_vma_remote(mm, addr,
+ FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
+ &vma);
+ if (IS_ERR(page))
+ return page;
+ folio = page_folio(page);
+
+ if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
+ folio_put(folio);
+ return ERR_PTR(-EOPNOTSUPP);
}
- mmu_notifier_invalidate_range_end(&range);
-
- return ret;
-}
-
-/**
- * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
- * @folio: The folio to replace page table entries for.
- * @mm: The mm_struct where the folio is expected to be mapped.
- * @address: Address where the folio is expected to be mapped.
- * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
- *
- * Tries to remove all the page table entries which are mapping this
- * folio and replace them with special device exclusive swap entries to
- * grant a device exclusive access to the folio.
- *
- * Context: Caller must hold the folio lock.
- * Return: false if the page is still mapped, or if it could not be unmapped
- * from the expected address. Otherwise returns true (success).
- */
-static bool folio_make_device_exclusive(struct folio *folio,
- struct mm_struct *mm, unsigned long address, void *owner)
-{
- struct make_exclusive_args args = {
- .mm = mm,
- .address = address,
- .owner = owner,
- .valid = false,
- };
- struct rmap_walk_control rwc = {
- .rmap_one = page_make_device_exclusive_one,
- .done = folio_not_mapped,
- .anon_lock = folio_lock_anon_vma_read,
- .arg = &args,
- };
+ ret = folio_lock_killable(folio);
+ if (ret) {
+ folio_put(folio);
+ return ERR_PTR(ret);
+ }
/*
- * Restrict to anonymous folios for now to avoid potential writeback
- * issues.
+ * Inform secondary MMUs that we are going to convert this PTE to
+ * device-exclusive, such that they unmap it now. Note that the
+ * caller must filter this event out to prevent livelocks.
*/
- if (!folio_test_anon(folio))
- return false;
-
- rmap_walk(folio, &rwc);
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
+ mm, addr, addr + PAGE_SIZE, owner);
+ mmu_notifier_invalidate_range_start(&range);
- return args.valid && !folio_mapcount(folio);
-}
+ /*
+ * Let's do a second walk and make sure we still find the same page
+ * mapped writable. Note that any page of an anonymous folio can
+ * only be mapped writable using exactly one PTE ("exclusive"), so
+ * there cannot be other mappings.
+ */
+ fw_folio = folio_walk_start(&fw, vma, addr, 0);
+ if (fw_folio != folio || fw.page != page ||
+ fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
+ if (fw_folio)
+ folio_walk_end(&fw, vma);
+ mmu_notifier_invalidate_range_end(&range);
+ folio_unlock(folio);
+ folio_put(folio);
+ goto retry;
+ }
-/**
- * make_device_exclusive_range() - Mark a range for exclusive use by a device
- * @mm: mm_struct of associated target process
- * @start: start of the region to mark for exclusive device access
- * @end: end address of region
- * @pages: returns the pages which were successfully marked for exclusive access
- * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
- *
- * Returns: number of pages found in the range by GUP. A page is marked for
- * exclusive access only if the page pointer is non-NULL.
- *
- * This function finds ptes mapping page(s) to the given address range, locks
- * them and replaces mappings with special swap entries preventing userspace CPU
- * access. On fault these entries are replaced with the original mapping after
- * calling MMU notifiers.
- *
- * A driver using this to program access from a device must use a mmu notifier
- * critical section to hold a device specific lock during programming. Once
- * programming is complete it should drop the page lock and reference after
- * which point CPU access to the page will revoke the exclusive access.
- */
-int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, struct page **pages,
- void *owner)
-{
- long npages = (end - start) >> PAGE_SHIFT;
- long i;
-
- npages = get_user_pages_remote(mm, start, npages,
- FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
- pages, NULL);
- if (npages < 0)
- return npages;
-
- for (i = 0; i < npages; i++, start += PAGE_SIZE) {
- struct folio *folio = page_folio(pages[i]);
- if (PageTail(pages[i]) || !folio_trylock(folio)) {
- folio_put(folio);
- pages[i] = NULL;
- continue;
- }
+ /* Nuke the page table entry so we get the uptodate dirty bit. */
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ fw.pte = ptep_clear_flush(vma, addr, fw.ptep);
- if (!folio_make_device_exclusive(folio, mm, start, owner)) {
- folio_unlock(folio);
- folio_put(folio);
- pages[i] = NULL;
- }
- }
+ /* Set the dirty flag on the folio now the PTE is gone. */
+ if (pte_dirty(fw.pte))
+ folio_mark_dirty(folio);
- return npages;
+ /*
+ * Store the pfn of the page in a special device-exclusive PFN swap PTE.
+ * do_swap_page() will trigger the conversion back while holding the
+ * folio lock.
+ */
+ entry = make_device_exclusive_entry(page_to_pfn(page));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(fw.pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ /* The pte is writable, uffd-wp does not apply. */
+ set_pte_at(mm, addr, fw.ptep, swp_pte);
+
+ folio_walk_end(&fw, vma);
+ mmu_notifier_invalidate_range_end(&range);
+ *foliop = folio;
+ return page;
}
-EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif
void __put_anon_vma(struct anon_vma *anon_vma)
@@ -2653,35 +2841,37 @@ static void rmap_walk_anon(struct folio *folio,
anon_vma_unlock_read(anon_vma);
}
-/*
- * rmap_walk_file - do something to file page using the object-based rmap method
- * @folio: the folio to be handled
- * @rwc: control variable according to each walk type
- * @locked: caller holds relevant rmap lock
+/**
+ * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping
+ * of a page mapped within a specified page cache object at a specified offset.
*
- * Find all the mappings of a folio using the mapping pointer and the vma chains
- * contained in the address_space struct it points to.
+ * @folio: Either the folio whose mappings to traverse, or if NULL,
+ * the callbacks specified in @rwc will be configured such
+ * as to be able to look up mappings correctly.
+ * @mapping: The page cache object whose mapping VMAs we intend to
+ * traverse. If @folio is non-NULL, this should be equal to
+ * folio_mapping(folio).
+ * @pgoff_start: The offset within @mapping of the page which we are
+ * looking up. If @folio is non-NULL, this should be equal
+ * to folio_pgoff(folio).
+ * @nr_pages: The number of pages mapped by the mapping. If @folio is
+ * non-NULL, this should be equal to folio_nr_pages(folio).
+ * @rwc: The reverse mapping walk control object describing how
+ * the traversal should proceed.
+ * @locked: Is the @mapping already locked? If not, we acquire the
+ * lock.
*/
-static void rmap_walk_file(struct folio *folio,
- struct rmap_walk_control *rwc, bool locked)
+static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
+ pgoff_t pgoff_start, unsigned long nr_pages,
+ struct rmap_walk_control *rwc, bool locked)
{
- struct address_space *mapping = folio_mapping(folio);
- pgoff_t pgoff_start, pgoff_end;
+ pgoff_t pgoff_end = pgoff_start + nr_pages - 1;
struct vm_area_struct *vma;
- /*
- * The page lock not only makes sure that page->mapping cannot
- * suddenly be NULLified by truncation, it makes sure that the
- * structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_rwsem.
- */
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio);
+ VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio);
+ VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio);
- if (!mapping)
- return;
-
- pgoff_start = folio_pgoff(folio);
- pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
if (!locked) {
if (i_mmap_trylock_read(mapping))
goto lookup;
@@ -2696,8 +2886,7 @@ static void rmap_walk_file(struct folio *folio,
lookup:
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
- unsigned long address = vma_address(vma, pgoff_start,
- folio_nr_pages(folio));
+ unsigned long address = vma_address(vma, pgoff_start, nr_pages);
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
@@ -2710,12 +2899,38 @@ lookup:
if (rwc->done && rwc->done(folio))
goto done;
}
-
done:
if (!locked)
i_mmap_unlock_read(mapping);
}
+/*
+ * rmap_walk_file - do something to file page using the object-based rmap method
+ * @folio: the folio to be handled
+ * @rwc: control variable according to each walk type
+ * @locked: caller holds relevant rmap lock
+ *
+ * Find all the mappings of a folio using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ */
+static void rmap_walk_file(struct folio *folio,
+ struct rmap_walk_control *rwc, bool locked)
+{
+ /*
+ * The folio lock not only makes sure that folio->mapping cannot
+ * suddenly be NULLified by truncation, it makes sure that the structure
+ * at mapping cannot be freed and reused yet, so we can safely take
+ * mapping->i_mmap_rwsem.
+ */
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (!folio->mapping)
+ return;
+
+ __rmap_walk_file(folio, folio->mapping, folio->index,
+ folio_nr_pages(folio), rwc, locked);
+}
+
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
{
if (unlikely(folio_test_ksm(folio)))
diff --git a/mm/shmem.c b/mm/shmem.c
index 4ea6109a8043..99327c30507c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -86,7 +86,6 @@ static struct vfsmount *shm_mnt __ro_after_init;
#include "internal.h"
-#define BLOCKS_PER_PAGE (PAGE_SIZE/512)
#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
/* Pretend that each entry is of this size in directory's i_size */
@@ -526,9 +525,9 @@ static bool shmem_confirm_swap(struct address_space *mapping,
* enables huge pages for the mount;
* SHMEM_HUGE_WITHIN_SIZE:
* only allocate huge pages if the page will be fully within i_size,
- * also respect fadvise()/madvise() hints;
+ * also respect madvise() hints;
* SHMEM_HUGE_ADVISE:
- * only allocate huge pages if requested with fadvise()/madvise();
+ * only allocate huge pages if requested with madvise();
*/
#define SHMEM_HUGE_NEVER 0
@@ -591,6 +590,28 @@ shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t w
return order > 0 ? BIT(order + 1) - 1 : 0;
}
+static unsigned int shmem_get_orders_within_size(struct inode *inode,
+ unsigned long within_size_orders, pgoff_t index,
+ loff_t write_end)
+{
+ pgoff_t aligned_index;
+ unsigned long order;
+ loff_t i_size;
+
+ order = highest_order(within_size_orders);
+ while (within_size_orders) {
+ aligned_index = round_up(index + 1, 1 << order);
+ i_size = max(write_end, i_size_read(inode));
+ i_size = round_up(i_size, PAGE_SIZE);
+ if (i_size >> PAGE_SHIFT >= aligned_index)
+ return within_size_orders;
+
+ order = next_order(&within_size_orders, order);
+ }
+
+ return 0;
+}
+
static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
loff_t write_end, bool shmem_huge_force,
struct vm_area_struct *vma,
@@ -599,9 +620,6 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index
unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
0 : BIT(HPAGE_PMD_ORDER);
unsigned long within_size_orders;
- unsigned int order;
- pgoff_t aligned_index;
- loff_t i_size;
if (!S_ISREG(inode->i_mode))
return 0;
@@ -635,16 +653,11 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index
within_size_orders = shmem_mapping_size_orders(inode->i_mapping,
index, write_end);
- order = highest_order(within_size_orders);
- while (within_size_orders) {
- aligned_index = round_up(index + 1, 1 << order);
- i_size = max(write_end, i_size_read(inode));
- i_size = round_up(i_size, PAGE_SIZE);
- if (i_size >> PAGE_SHIFT >= aligned_index)
- return within_size_orders;
+ within_size_orders = shmem_get_orders_within_size(inode, within_size_orders,
+ index, write_end);
+ if (within_size_orders > 0)
+ return within_size_orders;
- order = next_order(&within_size_orders, order);
- }
fallthrough;
case SHMEM_HUGE_ADVISE:
if (vm_flags & VM_HUGEPAGE)
@@ -1380,9 +1393,9 @@ static void shmem_evict_inode(struct inode *inode)
#endif
}
-static int shmem_find_swap_entries(struct address_space *mapping,
- pgoff_t start, struct folio_batch *fbatch,
- pgoff_t *indices, unsigned int type)
+static unsigned int shmem_find_swap_entries(struct address_space *mapping,
+ pgoff_t start, struct folio_batch *fbatch,
+ pgoff_t *indices, unsigned int type)
{
XA_STATE(xas, &mapping->i_pages, start);
struct folio *folio;
@@ -1415,7 +1428,7 @@ static int shmem_find_swap_entries(struct address_space *mapping,
}
rcu_read_unlock();
- return xas.xa_index;
+ return folio_batch_count(fbatch);
}
/*
@@ -1462,8 +1475,8 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type)
do {
folio_batch_init(&fbatch);
- shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
- if (folio_batch_count(&fbatch) == 0) {
+ if (!shmem_find_swap_entries(mapping, start, &fbatch,
+ indices, type)) {
ret = 0;
break;
}
@@ -1533,7 +1546,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- swp_entry_t swap;
pgoff_t index;
int nr_pages;
bool split = false;
@@ -1548,7 +1560,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (WARN_ON_ONCE(!wbc->for_reclaim))
goto redirty;
- if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
+ if ((info->flags & VM_LOCKED) || sbinfo->noswap)
goto redirty;
if (!total_swap_pages)
@@ -1615,14 +1627,6 @@ try_split:
folio_mark_uptodate(folio);
}
- swap = folio_alloc_swap(folio);
- if (!swap.val) {
- if (nr_pages > 1)
- goto try_split;
-
- goto redirty;
- }
-
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
* if it's not already there. Do it now before the folio is
@@ -1635,20 +1639,20 @@ try_split:
if (list_empty(&info->swaplist))
list_add(&info->swaplist, &shmem_swaplist);
- if (add_to_swap_cache(folio, swap,
- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
- NULL) == 0) {
+ if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
shmem_recalc_inode(inode, 0, nr_pages);
- swap_shmem_alloc(swap, nr_pages);
- shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
+ swap_shmem_alloc(folio->swap, nr_pages);
+ shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(folio_mapped(folio));
return swap_writepage(&folio->page, wbc);
}
+ list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
- put_swap_folio(folio, swap);
+ if (nr_pages > 1)
+ goto try_split;
redirty:
folio_mark_dirty(folio);
if (wbc->for_reclaim)
@@ -1757,10 +1761,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
unsigned long mask = READ_ONCE(huge_shmem_orders_always);
unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
unsigned long vm_flags = vma ? vma->vm_flags : 0;
- pgoff_t aligned_index;
unsigned int global_orders;
- loff_t i_size;
- int order;
if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
return 0;
@@ -1786,17 +1787,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
return READ_ONCE(huge_shmem_orders_inherit);
/* Allow mTHP that will be fully within i_size. */
- order = highest_order(within_size_orders);
- while (within_size_orders) {
- aligned_index = round_up(index + 1, 1 << order);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >> PAGE_SHIFT >= aligned_index) {
- mask |= within_size_orders;
- break;
- }
-
- order = next_order(&within_size_orders, order);
- }
+ mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0);
if (vm_flags & VM_HUGEPAGE)
mask |= READ_ONCE(huge_shmem_orders_madvise);
@@ -2017,7 +2008,7 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
__folio_set_swapbacked(new);
new->swap = entry;
- mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
+ memcg1_swapin(entry, nr_pages);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
workingset_refault(new, shadow);
@@ -2162,15 +2153,16 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
{
struct address_space *mapping = inode->i_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
- void *alloced_shadow = NULL;
- int alloced_order = 0, i;
+ int split_order = 0, entry_order;
+ int i;
/* Convert user data gfp flags to xarray node gfp flags */
gfp &= GFP_RECLAIM_MASK;
for (;;) {
- int order = -1, split_order = 0;
void *old = NULL;
+ int cur_order;
+ pgoff_t swap_index;
xas_lock_irq(&xas);
old = xas_load(&xas);
@@ -2179,60 +2171,56 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
goto unlock;
}
- order = xas_get_order(&xas);
+ entry_order = xas_get_order(&xas);
- /* Swap entry may have changed before we re-acquire the lock */
- if (alloced_order &&
- (old != alloced_shadow || order != alloced_order)) {
- xas_destroy(&xas);
- alloced_order = 0;
- }
+ if (!entry_order)
+ goto unlock;
/* Try to split large swap entry in pagecache */
- if (order > 0) {
- if (!alloced_order) {
- split_order = order;
+ cur_order = entry_order;
+ swap_index = round_down(index, 1 << entry_order);
+
+ split_order = xas_try_split_min_order(cur_order);
+
+ while (cur_order > 0) {
+ pgoff_t aligned_index =
+ round_down(index, 1 << cur_order);
+ pgoff_t swap_offset = aligned_index - swap_index;
+
+ xas_set_order(&xas, index, split_order);
+ xas_try_split(&xas, old, cur_order);
+ if (xas_error(&xas))
goto unlock;
- }
- xas_split(&xas, old, order);
/*
* Re-set the swap entry after splitting, and the swap
* offset of the original large entry must be continuous.
*/
- for (i = 0; i < 1 << order; i++) {
- pgoff_t aligned_index = round_down(index, 1 << order);
+ for (i = 0; i < 1 << cur_order;
+ i += (1 << split_order)) {
swp_entry_t tmp;
- tmp = swp_entry(swp_type(swap), swp_offset(swap) + i);
+ tmp = swp_entry(swp_type(swap),
+ swp_offset(swap) + swap_offset +
+ i);
__xa_store(&mapping->i_pages, aligned_index + i,
swp_to_radix_entry(tmp), 0);
}
+ cur_order = split_order;
+ split_order = xas_try_split_min_order(split_order);
}
unlock:
xas_unlock_irq(&xas);
- /* split needed, alloc here and retry. */
- if (split_order) {
- xas_split_alloc(&xas, old, split_order, gfp);
- if (xas_error(&xas))
- goto error;
- alloced_shadow = old;
- alloced_order = split_order;
- xas_reset(&xas);
- continue;
- }
-
if (!xas_nomem(&xas, gfp))
break;
}
-error:
if (xas_error(&xas))
return xas_error(&xas);
- return alloced_order;
+ return entry_order;
}
/*
@@ -2253,7 +2241,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio *folio = NULL;
bool skip_swapcache = false;
swp_entry_t swap;
- int error, nr_pages;
+ int error, nr_pages, order, split_order;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
swap = radix_to_swp_entry(*foliop);
@@ -2272,10 +2260,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
+ order = xa_get_order(&mapping->i_pages, index);
if (!folio) {
- int order = xa_get_order(&mapping->i_pages, index);
bool fallback_order0 = false;
- int split_order;
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
@@ -2339,6 +2326,29 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
error = -ENOMEM;
goto failed;
}
+ } else if (order != folio_order(folio)) {
+ /*
+ * Swap readahead may swap in order 0 folios into swapcache
+ * asynchronously, while the shmem mapping can still stores
+ * large swap entries. In such cases, we should split the
+ * large swap entry to prevent possible data corruption.
+ */
+ split_order = shmem_split_large_entry(inode, index, swap, gfp);
+ if (split_order < 0) {
+ error = split_order;
+ goto failed;
+ }
+
+ /*
+ * If the large swap entry has already been split, it is
+ * necessary to recalculate the new swap entry based on
+ * the old order alignment.
+ */
+ if (split_order > 0) {
+ pgoff_t offset = index - round_down(index, 1 << split_order);
+
+ swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+ }
}
alloced:
@@ -2346,7 +2356,8 @@ alloced:
folio_lock(folio);
if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
folio->swap.val != swap.val ||
- !shmem_confirm_swap(mapping, index, swap)) {
+ !shmem_confirm_swap(mapping, index, swap) ||
+ xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
error = -EEXIST;
goto unlock;
}
@@ -3279,8 +3290,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
if (ret)
return ret;
- if (folio_test_hwpoison(folio) ||
- (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
+ if (folio_contain_hwpoisoned_page(folio)) {
folio_unlock(folio);
folio_put(folio);
return -EIO;
@@ -3487,7 +3497,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
size = min_t(size_t, size, PAGE_SIZE - offset);
- if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ if (!pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
*buf = (struct pipe_buffer) {
@@ -3514,7 +3524,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
int error = 0;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -3601,7 +3611,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
break;
cond_resched();
@@ -3889,16 +3899,16 @@ out_iput:
return error;
}
-static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
if (error)
- return error;
+ return ERR_PTR(error);
inc_nlink(dir);
- return 0;
+ return NULL;
}
static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
@@ -5651,19 +5661,19 @@ static int __init setup_thp_shmem(char *str)
THP_ORDERS_ALL_FILE_DEFAULT);
}
- if (start == -EINVAL) {
+ if (start < 0) {
pr_err("invalid size %s in thp_shmem boot parameter\n",
start_size);
goto err;
}
- if (end == -EINVAL) {
+ if (end < 0) {
pr_err("invalid size %s in thp_shmem boot parameter\n",
end_size);
goto err;
}
- if (start < 0 || end < 0 || start > end)
+ if (start > end)
goto err;
nr = end - start + 1;
@@ -5830,7 +5840,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
* underlying inode. So users of this interface must do LSM checks at a
* higher layer. The users are the big_key and shm implementations. LSM
* checks are provided at the key or shm level rather than the inode.
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
@@ -5842,7 +5852,7 @@ EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
/**
* shmem_file_setup - get an unlinked file living in tmpfs
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
@@ -5855,7 +5865,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup);
/**
* shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
* @mnt: the tmpfs mount where the file will be created
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 43afb56abbd3..6af13bcd2ab3 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -260,6 +260,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
" pagetables:%lukB"
" sec_pagetables:%lukB"
" all_unreclaimable? %s"
+ " Balloon:%lukB"
"\n",
pgdat->node_id,
K(node_page_state(pgdat, NR_ACTIVE_ANON)),
@@ -285,7 +286,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
- str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES));
+ str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES),
+ K(node_page_state(pgdat, NR_BALLOON_PAGES)));
}
for_each_populated_zone(zone) {
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 794bd433cce0..20eaee3e97f7 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -214,10 +214,14 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
ret = debugfs_change_name(shrinker->debugfs_entry, "%s-%d",
shrinker->name, shrinker->debugfs_id);
+ if (ret) {
+ shrinker->name = old;
+ kfree_const(new);
+ } else {
+ kfree_const(old);
+ }
mutex_unlock(&shrinker_mutex);
- kfree_const(old);
-
return ret;
}
EXPORT_SYMBOL(shrinker_debugfs_rename);
diff --git a/mm/slab.h b/mm/slab.h
index e9fd9bf0bfa6..05a21dc796e0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -457,39 +457,17 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
}
-/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_PANIC | \
- SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
+ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
+ SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | SLAB_ACCOUNT | \
+ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-#ifdef CONFIG_SLUB_DEBUG
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
-#else
-#define SLAB_DEBUG_FLAGS (0)
-#endif
-#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_ACCOUNT | \
- SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-
-/* Common flags available with current configuration */
-#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
-
-/* Common flags permitted for kmem_cache_create */
-#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
- SLAB_RED_ZONE | \
- SLAB_POISON | \
- SLAB_STORE_USER | \
- SLAB_TRACE | \
- SLAB_CONSISTENCY_CHECKS | \
- SLAB_NOLEAKTRACE | \
- SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | \
- SLAB_ACCOUNT | \
- SLAB_KMALLOC | \
- SLAB_NO_MERGE | \
- SLAB_NO_USER_FLAGS)
+#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS)
bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
@@ -604,6 +582,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts);
#endif
+void kvfree_rcu_cb(struct rcu_head *head);
+
size_t __ksize(const void *objp);
static inline size_t slab_ksize(const struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 4030907b6b7d..5be257e03c7c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -298,6 +298,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
static_branch_enable(&slub_debug_enabled);
if (flags & SLAB_STORE_USER)
stack_depot_init();
+#else
+ flags &= ~SLAB_DEBUG_FLAGS;
#endif
mutex_lock(&slab_mutex);
@@ -307,20 +309,11 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
goto out_unlock;
}
- /* Refuse requests with allocator specific flags */
if (flags & ~SLAB_FLAGS_PERMITTED) {
err = -EINVAL;
goto out_unlock;
}
- /*
- * Some allocators will constraint the set of valid flags to a subset
- * of all flags. We expect them to define CACHE_CREATE_MASK in this
- * case, and we'll just provide them with a sanitized version of the
- * passed flags.
- */
- flags &= CACHE_CREATE_MASK;
-
/* Fail closed on bad usersize of useroffset values. */
if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
WARN_ON(!args->usersize && args->useroffset) ||
@@ -1284,6 +1277,29 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
EXPORT_TRACEPOINT_SYMBOL(kfree);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
+#ifndef CONFIG_KVFREE_RCU_BATCHED
+
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+ if (head) {
+ kasan_record_aux_stack(ptr);
+ call_rcu(head, kvfree_rcu_cb);
+ return;
+ }
+
+ // kvfree_rcu(one_arg) call.
+ might_sleep();
+ synchronize_rcu();
+ kvfree(ptr);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+
+void __init kvfree_rcu_init(void)
+{
+}
+
+#else /* CONFIG_KVFREE_RCU_BATCHED */
+
/*
* This rcu parameter is runtime-read-only. It reflects
* a minimum allowed number of objects which can be cached
@@ -1304,6 +1320,8 @@ module_param(rcu_min_cached_objs, int, 0444);
static int rcu_delay_page_cache_fill_msec = 5000;
module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+static struct workqueue_struct *rcu_reclaim_wq;
+
/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (5 * HZ)
#define KFREE_N_BATCHES 2
@@ -1532,8 +1550,7 @@ kvfree_rcu_list(struct rcu_head *head)
rcu_lock_acquire(&rcu_callback_map);
trace_rcu_invoke_kvfree_callback("slab", head, offset);
- if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
- kvfree(ptr);
+ kvfree(ptr);
rcu_lock_release(&rcu_callback_map);
cond_resched_tasks_rcu_qs();
@@ -1632,10 +1649,10 @@ __schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
if (delayed_work_pending(&krcp->monitor_work)) {
delay_left = krcp->monitor_work.timer.expires - jiffies;
if (delay < delay_left)
- mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
+ mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
return;
}
- queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
+ queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
}
static void
@@ -1733,7 +1750,7 @@ kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
// "free channels", the batch can handle. Break
// the loop since it is done with this CPU thus
// queuing an RCU work is _always_ success here.
- queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work);
+ queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
WARN_ON_ONCE(!queued);
break;
}
@@ -1861,8 +1878,6 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
return true;
}
-#if !defined(CONFIG_TINY_RCU)
-
static enum hrtimer_restart
schedule_page_work_fn(struct hrtimer *t)
{
@@ -1883,12 +1898,12 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
if (atomic_read(&krcp->backoff_page_cache_fill)) {
- queue_delayed_work(system_unbound_wq,
+ queue_delayed_work(rcu_reclaim_wq,
&krcp->page_cache_work,
msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
} else {
- hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- krcp->hrtimer.function = schedule_page_work_fn;
+ hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
}
}
@@ -2071,8 +2086,6 @@ void kvfree_rcu_barrier(void)
}
EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
-#endif /* #if !defined(CONFIG_TINY_RCU) */
-
static unsigned long
kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
@@ -2120,6 +2133,10 @@ void __init kvfree_rcu_init(void)
int i, j;
struct shrinker *kfree_rcu_shrinker;
+ rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_reclaim_wq);
+
/* Clamp it to [0:100] seconds interval. */
if (rcu_delay_page_cache_fill_msec < 0 ||
rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
@@ -2162,3 +2179,6 @@ void __init kvfree_rcu_init(void)
shrinker_register(kfree_rcu_shrinker);
}
+
+#endif /* CONFIG_KVFREE_RCU_BATCHED */
+
diff --git a/mm/slub.c b/mm/slub.c
index 1f50129dcfb3..b46f87662e71 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -19,6 +19,7 @@
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
+#include <linux/vmalloc.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
@@ -1017,22 +1018,31 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object)
set_orig_size(s, (void *)object, s->object_size);
}
-static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp)
{
struct va_format vaf;
va_list args;
- va_start(args, fmt);
+ va_copy(args, argsp);
vaf.fmt = fmt;
vaf.va = &args;
pr_err("=============================================================================\n");
- pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
+ pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf);
pr_err("-----------------------------------------------------------------------------\n\n");
va_end(args);
}
+static void slab_bug(struct kmem_cache *s, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ __slab_bug(s, fmt, args);
+ va_end(args);
+}
+
__printf(2, 3)
-static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+static void slab_fix(struct kmem_cache *s, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -1085,19 +1095,19 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
/* Beginning of the filler is the free pointer */
print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
-
- dump_stack();
}
static void object_err(struct kmem_cache *s, struct slab *slab,
- u8 *object, char *reason)
+ u8 *object, const char *reason)
{
if (slab_add_kunit_errors())
return;
- slab_bug(s, "%s", reason);
+ slab_bug(s, reason);
print_trailer(s, slab, object);
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ WARN_ON(1);
}
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
@@ -1114,22 +1124,30 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
return false;
}
+static void __slab_err(struct slab *slab)
+{
+ if (slab_in_kunit_test())
+ return;
+
+ print_slab_info(slab);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ WARN_ON(1);
+}
+
static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
const char *fmt, ...)
{
va_list args;
- char buf[100];
if (slab_add_kunit_errors())
return;
va_start(args, fmt);
- vsnprintf(buf, sizeof(buf), fmt, args);
+ __slab_bug(s, fmt, args);
va_end(args);
- slab_bug(s, "%s", buf);
- print_slab_info(slab);
- dump_stack();
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ __slab_err(slab);
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
@@ -1166,7 +1184,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
s->inuse - poison_size);
}
-static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
+static void restore_bytes(struct kmem_cache *s, const char *message, u8 data,
void *from, void *to)
{
slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
@@ -1181,8 +1199,8 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
static pad_check_attributes int
check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
- u8 *object, char *what,
- u8 *start, unsigned int value, unsigned int bytes)
+ u8 *object, const char *what, u8 *start, unsigned int value,
+ unsigned int bytes, bool slab_obj_print)
{
u8 *fault;
u8 *end;
@@ -1201,10 +1219,11 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
if (slab_add_kunit_errors())
goto skip_bug_print;
- slab_bug(s, "%s overwritten", what);
- pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
- fault, end - 1, fault - addr,
- fault[0], value);
+ pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ what, fault, end - 1, fault - addr, fault[0], value);
+
+ if (slab_obj_print)
+ object_err(s, slab, object, "Object corrupt");
skip_bug_print:
restore_bytes(s, what, value, fault, end);
@@ -1268,7 +1287,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
return 1;
return check_bytes_and_report(s, slab, p, "Object padding",
- p + off, POISON_INUSE, size_from_object(s) - off);
+ p + off, POISON_INUSE, size_from_object(s) - off, true);
}
/* Check the pad bytes at the end of a slab page */
@@ -1301,9 +1320,10 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab)
while (end > fault && end[-1] == POISON_INUSE)
end--;
- slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
- fault, end - 1, fault - start);
+ slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+ fault, end - 1, fault - start);
print_section(KERN_ERR, "Padding ", pad, remainder);
+ __slab_err(slab);
restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
}
@@ -1318,11 +1338,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, slab, object, "Left Redzone",
- object - s->red_left_pad, val, s->red_left_pad))
+ object - s->red_left_pad, val, s->red_left_pad, ret))
ret = 0;
if (!check_bytes_and_report(s, slab, object, "Right Redzone",
- endobject, val, s->inuse - s->object_size))
+ endobject, val, s->inuse - s->object_size, ret))
ret = 0;
if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
@@ -1331,7 +1351,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (s->object_size > orig_size &&
!check_bytes_and_report(s, slab, object,
"kmalloc Redzone", p + orig_size,
- val, s->object_size - orig_size)) {
+ val, s->object_size - orig_size, ret)) {
ret = 0;
}
}
@@ -1339,7 +1359,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
if (!check_bytes_and_report(s, slab, p, "Alignment padding",
endobject, POISON_INUSE,
- s->inuse - s->object_size))
+ s->inuse - s->object_size, ret))
ret = 0;
}
}
@@ -1355,11 +1375,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (kasan_meta_size < s->object_size - 1 &&
!check_bytes_and_report(s, slab, p, "Poison",
p + kasan_meta_size, POISON_FREE,
- s->object_size - kasan_meta_size - 1))
+ s->object_size - kasan_meta_size - 1, ret))
ret = 0;
if (kasan_meta_size < s->object_size &&
!check_bytes_and_report(s, slab, p, "End Poison",
- p + s->object_size - 1, POISON_END, 1))
+ p + s->object_size - 1, POISON_END, 1, ret))
ret = 0;
}
/*
@@ -1385,11 +1405,6 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
ret = 0;
}
- if (!ret && !slab_in_kunit_test()) {
- print_trailer(s, slab, object);
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
- }
-
return ret;
}
@@ -1427,7 +1442,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab)
* Determine if a certain object in a slab is on the freelist. Must hold the
* slab lock to guarantee that the chains are in a consistent state.
*/
-static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
+static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
{
int nr = 0;
void *fp;
@@ -1437,26 +1452,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
fp = slab->freelist;
while (fp && nr <= slab->objects) {
if (fp == search)
- return 1;
+ return true;
if (!check_valid_pointer(s, slab, fp)) {
if (object) {
object_err(s, slab, object,
"Freechain corrupt");
set_freepointer(s, object, NULL);
+ break;
} else {
slab_err(s, slab, "Freepointer corrupt");
slab->freelist = NULL;
slab->inuse = slab->objects;
slab_fix(s, "Freelist cleared");
- return 0;
+ return false;
}
- break;
}
object = fp;
fp = get_freepointer(s, object);
nr++;
}
+ if (nr > slab->objects) {
+ slab_err(s, slab, "Freelist cycle detected");
+ slab->freelist = NULL;
+ slab->inuse = slab->objects;
+ slab_fix(s, "Freelist cleared");
+ return false;
+ }
+
max_objects = order_objects(slab_order(slab), s->size);
if (max_objects > MAX_OBJS_PER_PAGE)
max_objects = MAX_OBJS_PER_PAGE;
@@ -1624,12 +1647,12 @@ static inline int free_consistency_checks(struct kmem_cache *s,
slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
object);
} else if (!slab->slab_cache) {
- pr_err("SLUB <none>: no slab for object 0x%p.\n",
- object);
- dump_stack();
- } else
+ slab_err(NULL, slab, "No slab cache for object 0x%p",
+ object);
+ } else {
object_err(s, slab, object,
- "page slab pointer corrupt.");
+ "page slab pointer corrupt.");
+ }
return 0;
}
return 1;
@@ -2000,7 +2023,8 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
return 0;
}
-static inline void free_slab_obj_exts(struct slab *slab)
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline void free_slab_obj_exts(struct slab *slab)
{
struct slabobj_ext *obj_exts;
@@ -2077,33 +2101,37 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
return slab_obj_exts(slab) + obj_to_index(s, slab, p);
}
-static inline void
-alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline void
+__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
- if (need_slab_obj_ext()) {
- struct slabobj_ext *obj_exts;
+ struct slabobj_ext *obj_exts;
- obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
- /*
- * Currently obj_exts is used only for allocation profiling.
- * If other users appear then mem_alloc_profiling_enabled()
- * check should be added before alloc_tag_add().
- */
- if (likely(obj_exts))
- alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
- }
+ obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
+ /*
+ * Currently obj_exts is used only for allocation profiling.
+ * If other users appear then mem_alloc_profiling_enabled()
+ * check should be added before alloc_tag_add().
+ */
+ if (likely(obj_exts))
+ alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
}
static inline void
-alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
- int objects)
+alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
+{
+ if (need_slab_obj_ext())
+ __alloc_tagging_slab_alloc_hook(s, object, flags);
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static noinline void
+__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
+ int objects)
{
struct slabobj_ext *obj_exts;
int i;
- if (!mem_alloc_profiling_enabled())
- return;
-
/* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
return;
@@ -2119,6 +2147,14 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
}
}
+static inline void
+alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
+ int objects)
+{
+ if (mem_alloc_profiling_enabled())
+ __alloc_tagging_slab_free_hook(s, slab, p, objects);
+}
+
#else /* CONFIG_MEM_ALLOC_PROFILING */
static inline void
@@ -4241,6 +4277,7 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
ptr = folio_address(folio);
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
PAGE_SIZE << order);
+ __folio_set_large_kmalloc(folio);
}
ptr = kasan_kmalloc_large(ptr, size, flags);
@@ -4716,6 +4753,11 @@ static void free_large_kmalloc(struct folio *folio, void *object)
{
unsigned int order = folio_order(folio);
+ if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) {
+ dump_page(&folio->page, "Not a kmalloc allocation");
+ return;
+ }
+
if (WARN_ON_ONCE(order == 0))
pr_warn_once("object pointer: 0x%p\n", object);
@@ -4725,9 +4767,55 @@ static void free_large_kmalloc(struct folio *folio, void *object)
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
-(PAGE_SIZE << order));
+ __folio_clear_large_kmalloc(folio);
folio_put(folio);
}
+/*
+ * Given an rcu_head embedded within an object obtained from kvmalloc at an
+ * offset < 4k, free the object in question.
+ */
+void kvfree_rcu_cb(struct rcu_head *head)
+{
+ void *obj = head;
+ struct folio *folio;
+ struct slab *slab;
+ struct kmem_cache *s;
+ void *slab_addr;
+
+ if (is_vmalloc_addr(obj)) {
+ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+ vfree(obj);
+ return;
+ }
+
+ folio = virt_to_folio(obj);
+ if (!folio_test_slab(folio)) {
+ /*
+ * rcu_head offset can be only less than page size so no need to
+ * consider folio order
+ */
+ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+ free_large_kmalloc(folio, obj);
+ return;
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+ slab_addr = folio_address(folio);
+
+ if (is_kfence_address(obj)) {
+ obj = kfence_object_start(obj);
+ } else {
+ unsigned int idx = __obj_to_index(s, slab_addr, obj);
+
+ obj = slab_addr + s->size * idx;
+ obj = fixup_red_left(s, obj);
+ }
+
+ slab_free(s, slab, obj, _RET_IP_);
+}
+
/**
* kfree - free previously allocated memory
* @object: pointer returned by kmalloc() or kmem_cache_alloc()
@@ -4878,6 +4966,168 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
}
EXPORT_SYMBOL(krealloc_noprof);
+static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
+{
+ /*
+ * We want to attempt a large physically contiguous block first because
+ * it is less likely to fragment multiple larger blocks and therefore
+ * contribute to a long term fragmentation less than vmalloc fallback.
+ * However make sure that larger requests are not too disruptive - no
+ * OOM killer and no allocation failure warnings as we have a fallback.
+ */
+ if (size > PAGE_SIZE) {
+ flags |= __GFP_NOWARN;
+
+ if (!(flags & __GFP_RETRY_MAYFAIL))
+ flags |= __GFP_NORETRY;
+
+ /* nofail semantic is implemented by the vmalloc fallback */
+ flags &= ~__GFP_NOFAIL;
+ }
+
+ return flags;
+}
+
+/**
+ * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @b: which set of kmalloc buckets to allocate from.
+ * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
+ * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
+ * preferable to the vmalloc fallback, due to visible performance drawbacks.
+ *
+ * Return: pointer to the allocated memory of %NULL in case of failure
+ */
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
+{
+ void *ret;
+
+ /*
+ * It doesn't really make sense to fallback to vmalloc for sub page
+ * requests
+ */
+ ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b),
+ kmalloc_gfp_adjust(flags, size),
+ node, _RET_IP_);
+ if (ret || size <= PAGE_SIZE)
+ return ret;
+
+ /* non-sleeping allocations are not supported by vmalloc */
+ if (!gfpflags_allow_blocking(flags))
+ return NULL;
+
+ /* Don't even allow crazy sizes */
+ if (unlikely(size > INT_MAX)) {
+ WARN_ON_ONCE(!(flags & __GFP_NOWARN));
+ return NULL;
+ }
+
+ /*
+ * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
+ * since the callers already cannot assume anything
+ * about the resulting pointer, and cannot play
+ * protection games.
+ */
+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+ flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(__kvmalloc_node_noprof);
+
+/**
+ * kvfree() - Free memory.
+ * @addr: Pointer to allocated memory.
+ *
+ * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
+ * It is slightly more efficient to use kfree() or vfree() if you are certain
+ * that you know which one to use.
+ *
+ * Context: Either preemptible task context or not-NMI interrupt.
+ */
+void kvfree(const void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+}
+EXPORT_SYMBOL(kvfree);
+
+/**
+ * kvfree_sensitive - Free a data object containing sensitive information.
+ * @addr: address of the data object to be freed.
+ * @len: length of the data object.
+ *
+ * Use the special memzero_explicit() function to clear the content of a
+ * kvmalloc'ed object containing sensitive data to make sure that the
+ * compiler won't optimize out the data clearing.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+ if (likely(!ZERO_OR_NULL_PTR(addr))) {
+ memzero_explicit((void *)addr, len);
+ kvfree(addr);
+ }
+}
+EXPORT_SYMBOL(kvfree_sensitive);
+
+/**
+ * kvrealloc - reallocate memory; contents remain unchanged
+ * @p: object to reallocate memory for
+ * @size: the size to reallocate
+ * @flags: the flags for the page level allocator
+ *
+ * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
+ * and @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
+ *
+ * This function must not be called concurrently with itself or kvfree() for the
+ * same memory allocation.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
+ */
+void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
+{
+ void *n;
+
+ if (is_vmalloc_addr(p))
+ return vrealloc_noprof(p, size, flags);
+
+ n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
+ if (!n) {
+ /* We failed to krealloc(), fall back to kvmalloc(). */
+ n = kvmalloc_noprof(size, flags);
+ if (!n)
+ return NULL;
+
+ if (p) {
+ /* We already know that `p` is not a vmalloc address. */
+ kasan_disable_current();
+ memcpy(n, kasan_reset_tag(p), ksize(p));
+ kasan_enable_current();
+
+ kfree(p);
+ }
+ }
+
+ return n;
+}
+EXPORT_SYMBOL(kvrealloc_noprof);
+
struct detached_freelist {
struct slab *slab;
void *tail;
@@ -5570,14 +5820,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
return !!oo_objects(s->oo);
}
-static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
- const char *text)
+static void list_slab_objects(struct kmem_cache *s, struct slab *slab)
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = slab_address(slab);
void *p;
- slab_err(s, slab, text, s->name);
+ if (!slab_add_kunit_errors())
+ slab_bug(s, "Objects remaining on __kmem_cache_shutdown()");
spin_lock(&object_map_lock);
__fill_map(object_map, s, slab);
@@ -5592,6 +5842,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
}
}
spin_unlock(&object_map_lock);
+
+ __slab_err(slab);
#endif
}
@@ -5612,8 +5864,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
remove_partial(n, slab);
list_add(&slab->slab_list, &discard);
} else {
- list_slab_objects(s, slab,
- "Objects remaining in %s on __kmem_cache_shutdown()");
+ list_slab_objects(s, slab);
}
}
spin_unlock_irq(&n->list_lock);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 3287ebadd167..fd2ab5118e13 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -30,6 +30,15 @@
#include <asm/dma.h>
#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+#include "hugetlb_vmemmap.h"
+
+/*
+ * Flags for vmemmap_populate_range and friends.
+ */
+/* Get a ref on the head page struct page, for ZONE_DEVICE compound pages */
+#define VMEMMAP_POPULATE_PAGEREF 0x0001
#include "internal.h"
@@ -144,17 +153,18 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
struct vmem_altmap *altmap,
- struct page *reuse)
+ unsigned long ptpfn, unsigned long flags)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(ptep_get(pte))) {
pte_t entry;
void *p;
- if (!reuse) {
+ if (ptpfn == (unsigned long)-1) {
p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
if (!p)
return NULL;
+ ptpfn = PHYS_PFN(__pa(p));
} else {
/*
* When a PTE/PMD entry is freed from the init_mm
@@ -165,10 +175,10 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
* and through vmemmap_populate_compound_pages() when
* slab is available.
*/
- get_page(reuse);
- p = page_to_virt(reuse);
+ if (flags & VMEMMAP_POPULATE_PAGEREF)
+ get_page(pfn_to_page(ptpfn));
}
- entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ entry = pfn_pte(ptpfn, PAGE_KERNEL);
set_pte_at(&init_mm, addr, pte, entry);
}
return pte;
@@ -238,7 +248,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
struct vmem_altmap *altmap,
- struct page *reuse)
+ unsigned long ptpfn,
+ unsigned long flags)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -258,7 +269,7 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
pmd = vmemmap_pmd_populate(pud, addr, node);
if (!pmd)
return NULL;
- pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
+ pte = vmemmap_pte_populate(pmd, addr, node, altmap, ptpfn, flags);
if (!pte)
return NULL;
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
@@ -269,13 +280,15 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
static int __meminit vmemmap_populate_range(unsigned long start,
unsigned long end, int node,
struct vmem_altmap *altmap,
- struct page *reuse)
+ unsigned long ptpfn,
+ unsigned long flags)
{
unsigned long addr = start;
pte_t *pte;
for (; addr < end; addr += PAGE_SIZE) {
- pte = vmemmap_populate_address(addr, node, altmap, reuse);
+ pte = vmemmap_populate_address(addr, node, altmap,
+ ptpfn, flags);
if (!pte)
return -ENOMEM;
}
@@ -286,7 +299,107 @@ static int __meminit vmemmap_populate_range(unsigned long start,
int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
int node, struct vmem_altmap *altmap)
{
- return vmemmap_populate_range(start, end, node, altmap, NULL);
+ return vmemmap_populate_range(start, end, node, altmap, -1, 0);
+}
+
+/*
+ * Undo populate_hvo, and replace it with a normal base page mapping.
+ * Used in memory init in case a HVO mapping needs to be undone.
+ *
+ * This can happen when it is discovered that a memblock allocated
+ * hugetlb page spans multiple zones, which can only be verified
+ * after zones have been initialized.
+ *
+ * We know that:
+ * 1) The first @headsize / PAGE_SIZE vmemmap pages were individually
+ * allocated through memblock, and mapped.
+ *
+ * 2) The rest of the vmemmap pages are mirrors of the last head page.
+ */
+int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end,
+ int node, unsigned long headsize)
+{
+ unsigned long maddr, pfn;
+ pte_t *pte;
+ int headpages;
+
+ /*
+ * Should only be called early in boot, so nothing will
+ * be accessing these page structures.
+ */
+ WARN_ON(!early_boot_irqs_disabled);
+
+ headpages = headsize >> PAGE_SHIFT;
+
+ /*
+ * Clear mirrored mappings for tail page structs.
+ */
+ for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
+ pte = virt_to_kpte(maddr);
+ pte_clear(&init_mm, maddr, pte);
+ }
+
+ /*
+ * Clear and free mappings for head page and first tail page
+ * structs.
+ */
+ for (maddr = addr; headpages-- > 0; maddr += PAGE_SIZE) {
+ pte = virt_to_kpte(maddr);
+ pfn = pte_pfn(ptep_get(pte));
+ pte_clear(&init_mm, maddr, pte);
+ memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE);
+ }
+
+ flush_tlb_kernel_range(addr, end);
+
+ return vmemmap_populate(addr, end, node, NULL);
+}
+
+/*
+ * Write protect the mirrored tail page structs for HVO. This will be
+ * called from the hugetlb code when gathering and initializing the
+ * memblock allocated gigantic pages. The write protect can't be
+ * done earlier, since it can't be guaranteed that the reserved
+ * page structures will not be written to during initialization,
+ * even if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
+ *
+ * The PTEs are known to exist, and nothing else should be touching
+ * these pages. The caller is responsible for any TLB flushing.
+ */
+void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
+ int node, unsigned long headsize)
+{
+ unsigned long maddr;
+ pte_t *pte;
+
+ for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
+ pte = virt_to_kpte(maddr);
+ ptep_set_wrprotect(&init_mm, maddr, pte);
+ }
+}
+
+/*
+ * Populate vmemmap pages HVO-style. The first page contains the head
+ * page and needed tail pages, the other ones are mirrors of the first
+ * page.
+ */
+int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
+ int node, unsigned long headsize)
+{
+ pte_t *pte;
+ unsigned long maddr;
+
+ for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) {
+ pte = vmemmap_populate_address(maddr, node, NULL, -1, 0);
+ if (!pte)
+ return -ENOMEM;
+ }
+
+ /*
+ * Reuse the last page struct page mapped above for the rest.
+ */
+ return vmemmap_populate_range(maddr, end, node, NULL,
+ pte_pfn(ptep_get(pte)), 0);
}
void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
@@ -409,7 +522,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
* with just tail struct pages.
*/
return vmemmap_populate_range(start, end, node, NULL,
- pte_page(ptep_get(pte)));
+ pte_pfn(ptep_get(pte)),
+ VMEMMAP_POPULATE_PAGEREF);
}
size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
@@ -417,13 +531,13 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
unsigned long next, last = addr + size;
/* Populate the head page vmemmap page */
- pte = vmemmap_populate_address(addr, node, NULL, NULL);
+ pte = vmemmap_populate_address(addr, node, NULL, -1, 0);
if (!pte)
return -ENOMEM;
/* Populate the tail pages vmemmap page */
next = addr + PAGE_SIZE;
- pte = vmemmap_populate_address(next, node, NULL, NULL);
+ pte = vmemmap_populate_address(next, node, NULL, -1, 0);
if (!pte)
return -ENOMEM;
@@ -433,7 +547,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
*/
next += PAGE_SIZE;
rc = vmemmap_populate_range(next, last, node, NULL,
- pte_page(ptep_get(pte)));
+ pte_pfn(ptep_get(pte)),
+ VMEMMAP_POPULATE_PAGEREF);
if (rc)
return -ENOMEM;
}
@@ -470,3 +585,28 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
return pfn_to_page(pfn);
}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
+/*
+ * This is called just before initializing sections for a NUMA node.
+ * Any special initialization that needs to be done before the
+ * generic initialization can be done from here. Sections that
+ * are initialized in hooks called from here will be skipped by
+ * the generic initialization.
+ */
+void __init sparse_vmemmap_init_nid_early(int nid)
+{
+ hugetlb_vmemmap_init_early(nid);
+}
+
+/*
+ * This is called just before the initialization of page structures
+ * through memmap_init. Zones are now initialized, so any work that
+ * needs to be done that needs zone information can be done from
+ * here.
+ */
+void __init sparse_vmemmap_init_nid_late(int nid)
+{
+ hugetlb_vmemmap_init_late(nid);
+}
+#endif
diff --git a/mm/sparse.c b/mm/sparse.c
index 133b033d0cba..3c012cf83cc2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -170,11 +170,6 @@ static void __section_mark_present(struct mem_section *ms,
ms->section_mem_map |= SECTION_MARKED_PRESENT;
}
-#define for_each_present_section_nr(start, section_nr) \
- for (section_nr = next_present_section_nr(start-1); \
- section_nr != -1; \
- section_nr = next_present_section_nr(section_nr))
-
static inline unsigned long first_present_section_nr(void)
{
return next_present_section_nr(-1);
@@ -408,13 +403,13 @@ static void __init check_usemap_section_nr(int nid,
#endif /* CONFIG_MEMORY_HOTREMOVE */
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static unsigned long __init section_map_size(void)
+unsigned long __init section_map_size(void)
{
return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
}
#else
-static unsigned long __init section_map_size(void)
+unsigned long __init section_map_size(void)
{
return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}
@@ -495,6 +490,44 @@ void __weak __meminit vmemmap_populate_print_last(void)
{
}
+static void *sparse_usagebuf __meminitdata;
+static void *sparse_usagebuf_end __meminitdata;
+
+/*
+ * Helper function that is used for generic section initialization, and
+ * can also be used by any hooks added above.
+ */
+void __init sparse_init_early_section(int nid, struct page *map,
+ unsigned long pnum, unsigned long flags)
+{
+ BUG_ON(!sparse_usagebuf || sparse_usagebuf >= sparse_usagebuf_end);
+ check_usemap_section_nr(nid, sparse_usagebuf);
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map,
+ sparse_usagebuf, SECTION_IS_EARLY | flags);
+ sparse_usagebuf = (void *)sparse_usagebuf + mem_section_usage_size();
+}
+
+static int __init sparse_usage_init(int nid, unsigned long map_count)
+{
+ unsigned long size;
+
+ size = mem_section_usage_size() * map_count;
+ sparse_usagebuf = sparse_early_usemaps_alloc_pgdat_section(
+ NODE_DATA(nid), size);
+ if (!sparse_usagebuf) {
+ sparse_usagebuf_end = NULL;
+ return -ENOMEM;
+ }
+
+ sparse_usagebuf_end = sparse_usagebuf + size;
+ return 0;
+}
+
+static void __init sparse_usage_fini(void)
+{
+ sparse_usagebuf = sparse_usagebuf_end = NULL;
+}
+
/*
* Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
* And number of present sections in this node is map_count.
@@ -503,47 +536,54 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
unsigned long pnum_end,
unsigned long map_count)
{
- struct mem_section_usage *usage;
unsigned long pnum;
struct page *map;
+ struct mem_section *ms;
- usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
- mem_section_usage_size() * map_count);
- if (!usage) {
+ if (sparse_usage_init(nid, map_count)) {
pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
goto failed;
}
+
sparse_buffer_init(map_count * section_map_size(), nid);
+
+ sparse_vmemmap_init_nid_early(nid);
+
for_each_present_section_nr(pnum_begin, pnum) {
unsigned long pfn = section_nr_to_pfn(pnum);
if (pnum >= pnum_end)
break;
- map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
- nid, NULL, NULL);
- if (!map) {
- pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
- __func__, nid);
- pnum_begin = pnum;
- sparse_buffer_fini();
- goto failed;
+ ms = __nr_to_section(pnum);
+ if (!preinited_vmemmap_section(ms)) {
+ map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
+ nid, NULL, NULL);
+ if (!map) {
+ pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
+ __func__, nid);
+ pnum_begin = pnum;
+ sparse_usage_fini();
+ sparse_buffer_fini();
+ goto failed;
+ }
+ sparse_init_early_section(nid, map, pnum, 0);
}
- check_usemap_section_nr(nid, usage);
- sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
- SECTION_IS_EARLY);
- usage = (void *) usage + mem_section_usage_size();
}
+ sparse_usage_fini();
sparse_buffer_fini();
return;
failed:
- /* We failed to allocate, mark all the following pnums as not present */
+ /*
+ * We failed to allocate, mark all the following pnums as not present,
+ * except the ones already initialized earlier.
+ */
for_each_present_section_nr(pnum_begin, pnum) {
- struct mem_section *ms;
-
if (pnum >= pnum_end)
break;
ms = __nr_to_section(pnum);
+ if (!preinited_vmemmap_section(ms))
+ ms->section_mem_map = 0;
ms->section_mem_map = 0;
}
}
diff --git a/mm/swap.c b/mm/swap.c
index fc8281ef4241..77b2d5997873 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -45,7 +45,7 @@
/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
-const int page_cluster_max = 31;
+static const int page_cluster_max = 31;
struct cpu_fbatches {
/*
@@ -956,8 +956,6 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- if (put_devmap_managed_folio_refs(folio, nr_refs))
- continue;
if (folio_ref_sub_and_test(folio, nr_refs))
free_zone_device_folio(folio);
continue;
@@ -1076,6 +1074,18 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
fbatch->nr = j;
}
+static const struct ctl_table swap_sysctl_table[] = {
+ {
+ .procname = "page-cluster",
+ .data = &page_cluster,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&page_cluster_max,
+ }
+};
+
/*
* Perform any setup for the swap system
*/
@@ -1092,4 +1102,6 @@ void __init swap_setup(void)
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
+
+ register_sysctl_init("vm", swap_sysctl_table);
}
diff --git a/mm/swap.h b/mm/swap.h
index ad2f121de970..6f4a3f927edb 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -3,6 +3,7 @@
#define _MM_SWAP_H
struct mempolicy;
+extern int page_cluster;
#ifdef CONFIG_SWAP
#include <linux/swapops.h> /* for swp_offset */
@@ -50,7 +51,6 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry)
}
void show_swap_cache_info(void);
-bool add_to_swap(struct folio *folio);
void *get_shadow_from_swap_cache(swp_entry_t entry);
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp);
@@ -163,11 +163,6 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
return filemap_get_folio(mapping, index);
}
-static inline bool add_to_swap(struct folio *folio)
-{
- return false;
-}
-
static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
{
return NULL;
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index be39078f255b..de779fed8c21 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -58,9 +58,11 @@ static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map,
* entries must not have been charged
*
* @folio: the folio that the swap entry belongs to
+ * @id: mem_cgroup ID to be recorded
* @ent: the first swap entry to be recorded
*/
-void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
+void swap_cgroup_record(struct folio *folio, unsigned short id,
+ swp_entry_t ent)
{
unsigned int nr_ents = folio_nr_pages(folio);
struct swap_cgroup *map;
@@ -72,8 +74,7 @@ void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
map = swap_cgroup_ctrl[swp_type(ent)].map;
do {
- old = __swap_cgroup_id_xchg(map, offset,
- mem_cgroup_id(folio_memcg(folio)));
+ old = __swap_cgroup_id_xchg(map, offset, id);
VM_BUG_ON(old);
} while (++offset != end);
}
@@ -91,8 +92,7 @@ void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
*/
unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents)
{
- pgoff_t offset = swp_offset(ent);
- pgoff_t end = offset + nr_ents;
+ pgoff_t offset, end;
struct swap_cgroup *map;
unsigned short old, iter = 0;
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
deleted file mode 100644
index 9c7c171df7ba..000000000000
--- a/mm/swap_slots.c
+++ /dev/null
@@ -1,295 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Manage cache of swap slots to be used for and returned from
- * swap.
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Author: Tim Chen <tim.c.chen@linux.intel.com>
- *
- * We allocate the swap slots from the global pool and put
- * it into local per cpu caches. This has the advantage
- * of no needing to acquire the swap_info lock every time
- * we need a new slot.
- *
- * There is also opportunity to simply return the slot
- * to local caches without needing to acquire swap_info
- * lock. We do not reuse the returned slots directly but
- * move them back to the global pool in a batch. This
- * allows the slots to coalesce and reduce fragmentation.
- *
- * The swap entry allocated is marked with SWAP_HAS_CACHE
- * flag in map_count that prevents it from being allocated
- * again from the global pool.
- *
- * The swap slots cache is protected by a mutex instead of
- * a spin lock as when we search for slots with scan_swap_map,
- * we can possibly sleep.
- */
-
-#include <linux/swap_slots.h>
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/mutex.h>
-#include <linux/mm.h>
-
-static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
-static bool swap_slot_cache_active;
-bool swap_slot_cache_enabled;
-static bool swap_slot_cache_initialized;
-static DEFINE_MUTEX(swap_slots_cache_mutex);
-/* Serialize swap slots cache enable/disable operations */
-static DEFINE_MUTEX(swap_slots_cache_enable_mutex);
-
-static void __drain_swap_slots_cache(void);
-
-#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
-
-static void deactivate_swap_slots_cache(void)
-{
- mutex_lock(&swap_slots_cache_mutex);
- swap_slot_cache_active = false;
- __drain_swap_slots_cache();
- mutex_unlock(&swap_slots_cache_mutex);
-}
-
-static void reactivate_swap_slots_cache(void)
-{
- mutex_lock(&swap_slots_cache_mutex);
- swap_slot_cache_active = true;
- mutex_unlock(&swap_slots_cache_mutex);
-}
-
-/* Must not be called with cpu hot plug lock */
-void disable_swap_slots_cache_lock(void)
-{
- mutex_lock(&swap_slots_cache_enable_mutex);
- swap_slot_cache_enabled = false;
- if (swap_slot_cache_initialized) {
- /* serialize with cpu hotplug operations */
- cpus_read_lock();
- __drain_swap_slots_cache();
- cpus_read_unlock();
- }
-}
-
-static void __reenable_swap_slots_cache(void)
-{
- swap_slot_cache_enabled = has_usable_swap();
-}
-
-void reenable_swap_slots_cache_unlock(void)
-{
- __reenable_swap_slots_cache();
- mutex_unlock(&swap_slots_cache_enable_mutex);
-}
-
-static bool check_cache_active(void)
-{
- long pages;
-
- if (!swap_slot_cache_enabled)
- return false;
-
- pages = get_nr_swap_pages();
- if (!swap_slot_cache_active) {
- if (pages > num_online_cpus() *
- THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE)
- reactivate_swap_slots_cache();
- goto out;
- }
-
- /* if global pool of slot caches too low, deactivate cache */
- if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE)
- deactivate_swap_slots_cache();
-out:
- return swap_slot_cache_active;
-}
-
-static int alloc_swap_slot_cache(unsigned int cpu)
-{
- struct swap_slots_cache *cache;
- swp_entry_t *slots;
-
- /*
- * Do allocation outside swap_slots_cache_mutex
- * as kvzalloc could trigger reclaim and folio_alloc_swap,
- * which can lock swap_slots_cache_mutex.
- */
- slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
- GFP_KERNEL);
- if (!slots)
- return -ENOMEM;
-
- mutex_lock(&swap_slots_cache_mutex);
- cache = &per_cpu(swp_slots, cpu);
- if (cache->slots) {
- /* cache already allocated */
- mutex_unlock(&swap_slots_cache_mutex);
-
- kvfree(slots);
-
- return 0;
- }
-
- if (!cache->lock_initialized) {
- mutex_init(&cache->alloc_lock);
- cache->lock_initialized = true;
- }
- cache->nr = 0;
- cache->cur = 0;
- cache->n_ret = 0;
- /*
- * We initialized alloc_lock and free_lock earlier. We use
- * !cache->slots or !cache->slots_ret to know if it is safe to acquire
- * the corresponding lock and use the cache. Memory barrier below
- * ensures the assumption.
- */
- mb();
- cache->slots = slots;
- mutex_unlock(&swap_slots_cache_mutex);
- return 0;
-}
-
-static void drain_slots_cache_cpu(unsigned int cpu, bool free_slots)
-{
- struct swap_slots_cache *cache;
-
- cache = &per_cpu(swp_slots, cpu);
- if (cache->slots) {
- mutex_lock(&cache->alloc_lock);
- swapcache_free_entries(cache->slots + cache->cur, cache->nr);
- cache->cur = 0;
- cache->nr = 0;
- if (free_slots && cache->slots) {
- kvfree(cache->slots);
- cache->slots = NULL;
- }
- mutex_unlock(&cache->alloc_lock);
- }
-}
-
-static void __drain_swap_slots_cache(void)
-{
- unsigned int cpu;
-
- /*
- * This function is called during
- * 1) swapoff, when we have to make sure no
- * left over slots are in cache when we remove
- * a swap device;
- * 2) disabling of swap slot cache, when we run low
- * on swap slots when allocating memory and need
- * to return swap slots to global pool.
- *
- * We cannot acquire cpu hot plug lock here as
- * this function can be invoked in the cpu
- * hot plug path:
- * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
- * -> memory allocation -> direct reclaim -> folio_alloc_swap
- * -> drain_swap_slots_cache
- *
- * Hence the loop over current online cpu below could miss cpu that
- * is being brought online but not yet marked as online.
- * That is okay as we do not schedule and run anything on a
- * cpu before it has been marked online. Hence, we will not
- * fill any swap slots in slots cache of such cpu.
- * There are no slots on such cpu that need to be drained.
- */
- for_each_online_cpu(cpu)
- drain_slots_cache_cpu(cpu, false);
-}
-
-static int free_slot_cache(unsigned int cpu)
-{
- mutex_lock(&swap_slots_cache_mutex);
- drain_slots_cache_cpu(cpu, true);
- mutex_unlock(&swap_slots_cache_mutex);
- return 0;
-}
-
-void enable_swap_slots_cache(void)
-{
- mutex_lock(&swap_slots_cache_enable_mutex);
- if (!swap_slot_cache_initialized) {
- int ret;
-
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
- alloc_swap_slot_cache, free_slot_cache);
- if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
- "without swap slots cache.\n", __func__))
- goto out_unlock;
-
- swap_slot_cache_initialized = true;
- }
-
- __reenable_swap_slots_cache();
-out_unlock:
- mutex_unlock(&swap_slots_cache_enable_mutex);
-}
-
-/* called with swap slot cache's alloc lock held */
-static int refill_swap_slots_cache(struct swap_slots_cache *cache)
-{
- if (!use_swap_slot_cache)
- return 0;
-
- cache->cur = 0;
- if (swap_slot_cache_active)
- cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
- cache->slots, 0);
-
- return cache->nr;
-}
-
-swp_entry_t folio_alloc_swap(struct folio *folio)
-{
- swp_entry_t entry;
- struct swap_slots_cache *cache;
-
- entry.val = 0;
-
- if (folio_test_large(folio)) {
- if (IS_ENABLED(CONFIG_THP_SWAP))
- get_swap_pages(1, &entry, folio_order(folio));
- goto out;
- }
-
- /*
- * Preemption is allowed here, because we may sleep
- * in refill_swap_slots_cache(). But it is safe, because
- * accesses to the per-CPU data structure are protected by the
- * mutex cache->alloc_lock.
- *
- * The alloc path here does not touch cache->slots_ret
- * so cache->free_lock is not taken.
- */
- cache = raw_cpu_ptr(&swp_slots);
-
- if (likely(check_cache_active() && cache->slots)) {
- mutex_lock(&cache->alloc_lock);
- if (cache->slots) {
-repeat:
- if (cache->nr) {
- entry = cache->slots[cache->cur];
- cache->slots[cache->cur++].val = 0;
- cache->nr--;
- } else if (refill_swap_slots_cache(cache)) {
- goto repeat;
- }
- }
- mutex_unlock(&cache->alloc_lock);
- if (entry.val)
- goto out;
- }
-
- get_swap_pages(1, &entry, 0);
-out:
- if (mem_cgroup_try_charge_swap(folio, entry)) {
- put_swap_folio(folio, entry);
- entry.val = 0;
- }
- return entry;
-}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ca42b2be64d9..68fd981b514f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -20,7 +20,6 @@
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
-#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
@@ -85,7 +84,7 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
/*
* add_to_swap_cache resembles filemap_add_folio on swapper_space,
- * but sets SwapCache flag and private instead of mapping and index.
+ * but sets SwapCache flag and 'swap' instead of mapping and index.
*/
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp)
@@ -167,67 +166,6 @@ void __delete_from_swap_cache(struct folio *folio,
__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}
-/**
- * add_to_swap - allocate swap space for a folio
- * @folio: folio we want to move to swap
- *
- * Allocate swap space for the folio and add the folio to the
- * swap cache.
- *
- * Context: Caller needs to hold the folio lock.
- * Return: Whether the folio was added to the swap cache.
- */
-bool add_to_swap(struct folio *folio)
-{
- swp_entry_t entry;
- int err;
-
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
-
- entry = folio_alloc_swap(folio);
- if (!entry.val)
- return false;
-
- /*
- * XArray node allocations from PF_MEMALLOC contexts could
- * completely exhaust the page allocator. __GFP_NOMEMALLOC
- * stops emergency reserves from being allocated.
- *
- * TODO: this could cause a theoretical memory reclaim
- * deadlock in the swap out path.
- */
- /*
- * Add it to the swap cache.
- */
- err = add_to_swap_cache(folio, entry,
- __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
- if (err)
- /*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
- */
- goto fail;
- /*
- * Normally the folio will be dirtied in unmap because its
- * pte should be dirty. A special case is MADV_FREE page. The
- * page's pte could have dirty bit cleared but the folio's
- * SwapBacked flag is still set because clearing the dirty bit
- * and SwapBacked flag has no lock protected. For such folio,
- * unmap will not set dirty bit for it, so folio reclaim will
- * not write the folio out. This can cause data corruption when
- * the folio is swapped in later. Always setting the dirty flag
- * for the folio solves the problem.
- */
- folio_mark_dirty(folio);
-
- return true;
-
-fail:
- put_swap_folio(folio, entry);
- return false;
-}
-
/*
* This must be called only on folios that have
* been verified to be in the swap cache and locked.
@@ -270,9 +208,7 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
xa_unlock_irq(&address_space->i_pages);
/* search the next swapcache until we meet end */
- curr >>= SWAP_ADDRESS_SPACE_SHIFT;
- curr++;
- curr <<= SWAP_ADDRESS_SPACE_SHIFT;
+ curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES);
if (curr > end)
break;
}
@@ -432,17 +368,13 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
bool skip_if_exists)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si = swp_swap_info(entry);
struct folio *folio;
struct folio *new_folio = NULL;
struct folio *result = NULL;
void *shadow = NULL;
*new_page_allocated = false;
- si = get_swap_device(entry);
- if (!si)
- return NULL;
-
for (;;) {
int err;
/*
@@ -457,13 +389,8 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* Just skip read ahead for unused swap slot.
- * During swap_off when swap_slot_cache is disabled,
- * we have to handle the race between putting
- * swap entry in swap cache and marking swap slot
- * as SWAP_HAS_CACHE. That's done in later part of code or
- * else swap_off will be aborted if we return NULL.
*/
- if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
+ if (!swap_entry_swapped(si, entry))
goto put_and_return;
/*
@@ -521,7 +448,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
goto fail_unlock;
- mem_cgroup_swapin_uncharge_swap(entry, 1);
+ memcg1_swapin(entry, 1);
if (shadow)
workingset_refault(new_folio, shadow);
@@ -538,7 +465,6 @@ fail_unlock:
put_swap_folio(new_folio, entry);
folio_unlock(new_folio);
put_and_return:
- put_swap_device(si);
if (!(*new_page_allocated) && new_folio)
folio_put(new_folio);
return result;
@@ -558,11 +484,16 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
struct swap_iocb **plug)
{
+ struct swap_info_struct *si;
bool page_allocated;
struct mempolicy *mpol;
pgoff_t ilx;
struct folio *folio;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
+
mpol = get_vma_policy(vma, addr, 0, &ilx);
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false);
@@ -570,6 +501,8 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
if (page_allocated)
swap_read_folio(folio, plug);
+
+ put_swap_device(si);
return folio;
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ba19430dd4ea..2eff8b51a945 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -37,7 +37,6 @@
#include <linux/oom.h>
#include <linux/swapfile.h>
#include <linux/export.h>
-#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>
#include <linux/suspend.h>
@@ -116,6 +115,18 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+struct percpu_swap_cluster {
+ struct swap_info_struct *si[SWAP_NR_ORDERS];
+ unsigned long offset[SWAP_NR_ORDERS];
+ local_lock_t lock;
+};
+
+static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
+ .si = { NULL },
+ .offset = { SWAP_ENTRY_INVALID },
+ .lock = INIT_LOCAL_LOCK(),
+};
+
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
if (type >= MAX_SWAPFILES)
@@ -158,10 +169,8 @@ static long swap_usage_in_pages(struct swap_info_struct *si)
#define TTRS_UNMAPPED 0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL 0x4
-/* Reclaim directly, bypass the slot cache and don't touch device lock */
-#define TTRS_DIRECT 0x8
-static bool swap_is_has_cache(struct swap_info_struct *si,
+static bool swap_only_has_cache(struct swap_info_struct *si,
unsigned long offset, int nr_pages)
{
unsigned char *map = si->swap_map + offset;
@@ -210,6 +219,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
int ret, nr_pages;
bool need_reclaim;
+again:
folio = filemap_get_folio(address_space, swap_cache_index(entry));
if (IS_ERR(folio))
return 0;
@@ -227,8 +237,16 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
if (!folio_trylock(folio))
goto out;
- /* offset could point to the middle of a large folio */
+ /*
+ * Offset could point to the middle of a large folio, or folio
+ * may no longer point to the expected offset before it's locked.
+ */
entry = folio->swap;
+ if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
offset = swp_offset(entry);
need_reclaim = ((flags & TTRS_ANYWAY) ||
@@ -243,28 +261,13 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
* reference or pending writeback, and can't be allocated to others.
*/
ci = lock_cluster(si, offset);
- need_reclaim = swap_is_has_cache(si, offset, nr_pages);
+ need_reclaim = swap_only_has_cache(si, offset, nr_pages);
unlock_cluster(ci);
if (!need_reclaim)
goto out_unlock;
- if (!(flags & TTRS_DIRECT)) {
- /* Free through slot cache */
- delete_from_swap_cache(folio);
- folio_set_dirty(folio);
- ret = nr_pages;
- goto out_unlock;
- }
-
- xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(folio, entry, NULL);
- xa_unlock_irq(&address_space->i_pages);
- folio_ref_sub(folio, nr_pages);
+ delete_from_swap_cache(folio);
folio_set_dirty(folio);
-
- ci = lock_cluster(si, offset);
- swap_entry_range_free(si, ci, entry, nr_pages);
- unlock_cluster(ci);
ret = nr_pages;
out_unlock:
folio_unlock(folio);
@@ -479,15 +482,6 @@ static void move_cluster(struct swap_info_struct *si,
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
struct swap_cluster_info *ci)
{
- unsigned int idx = cluster_index(si, ci);
- /*
- * If scan_swap_map_slots() can't find a free cluster, it will check
- * si->swap_map directly. To make sure the discarding cluster isn't
- * taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
- * It will be cleared after discard
- */
- memset(si->swap_map + idx * SWAPFILE_CLUSTER,
- SWAP_MAP_BAD, SWAPFILE_CLUSTER);
VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD);
schedule_work(&si->discard_work);
@@ -556,7 +550,7 @@ static bool swap_do_scheduled_discard(struct swap_info_struct *si)
ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
/*
* Delete the cluster from list to prepare for discard, but keep
- * the CLUSTER_FLAG_DISCARD flag, there could be percpu_cluster
+ * the CLUSTER_FLAG_DISCARD flag, percpu_swap_cluster could be
* pointing to it, or ran into by relocate_cluster.
*/
list_del(&ci->list);
@@ -571,8 +565,6 @@ static bool swap_do_scheduled_discard(struct swap_info_struct *si)
* return the cluster to allocation list.
*/
ci->flags = CLUSTER_FLAG_NONE;
- memset(si->swap_map + idx * SWAPFILE_CLUSTER,
- 0, SWAPFILE_CLUSTER);
__free_cluster(si, ci);
spin_unlock(&ci->lock);
ret = true;
@@ -653,7 +645,8 @@ static void relocate_cluster(struct swap_info_struct *si,
return;
if (!ci->count) {
- free_cluster(si, ci);
+ if (ci->flags != CLUSTER_FLAG_FREE)
+ free_cluster(si, ci);
} else if (ci->count != SWAPFILE_CLUSTER) {
if (ci->flags != CLUSTER_FLAG_FRAG)
move_cluster(si, ci, &si->frag_clusters[ci->order],
@@ -698,7 +691,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si,
offset++;
break;
case SWAP_HAS_CACHE:
- nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT);
+ nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
if (nr_reclaim > 0)
offset += nr_reclaim;
else
@@ -729,6 +722,9 @@ static bool cluster_scan_range(struct swap_info_struct *si,
unsigned long offset, end = start + nr_pages;
unsigned char *map = si->swap_map;
+ if (cluster_is_empty(ci))
+ return true;
+
for (offset = start; offset < end; offset++) {
switch (READ_ONCE(map[offset])) {
case 0:
@@ -820,14 +816,15 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
out:
relocate_cluster(si, ci);
unlock_cluster(ci);
- if (si->flags & SWP_SOLIDSTATE)
- __this_cpu_write(si->percpu_cluster->next[order], next);
- else
+ if (si->flags & SWP_SOLIDSTATE) {
+ this_cpu_write(percpu_swap_cluster.offset[order], next);
+ this_cpu_write(percpu_swap_cluster.si[order], si);
+ } else {
si->global_cluster->next[order] = next;
+ }
return found;
}
-/* Return true if reclaimed a whole cluster */
static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
long to_scan = 1;
@@ -848,7 +845,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) {
spin_unlock(&ci->lock);
nr_reclaim = __try_to_reclaim_swap(si, offset,
- TTRS_ANYWAY | TTRS_DIRECT);
+ TTRS_ANYWAY);
spin_lock(&ci->lock);
if (nr_reclaim) {
offset += abs(nr_reclaim);
@@ -858,6 +855,10 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
offset++;
}
+ /* in case no swap cache is reclaimed */
+ if (ci->flags == CLUSTER_FLAG_NONE)
+ relocate_cluster(si, ci);
+
unlock_cluster(ci);
if (to_scan <= 0)
break;
@@ -874,27 +875,29 @@ static void swap_reclaim_work(struct work_struct *work)
}
/*
- * Try to get swap entries with specified order from current cpu's swap entry
- * pool (a cluster). This might involve allocating a new cluster for current CPU
- * too.
+ * Try to allocate swap entries with specified order and try set a new
+ * cluster for current CPU too.
*/
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
unsigned char usage)
{
struct swap_cluster_info *ci;
- unsigned int offset, found = 0;
+ unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
- if (si->flags & SWP_SOLIDSTATE) {
- /* Fast path using per CPU cluster */
- local_lock(&si->percpu_cluster->lock);
- offset = __this_cpu_read(si->percpu_cluster->next[order]);
- } else {
+ /*
+ * Swapfile is not block device so unable
+ * to allocate large entries.
+ */
+ if (order && !(si->flags & SWP_BLKDEV))
+ return 0;
+
+ if (!(si->flags & SWP_SOLIDSTATE)) {
/* Serialize HDD SWAP allocation for each device. */
spin_lock(&si->global_cluster_lock);
offset = si->global_cluster->next[order];
- }
+ if (offset == SWAP_ENTRY_INVALID)
+ goto new_cluster;
- if (offset) {
ci = lock_cluster(si, offset);
/* Cluster could have been used by another order */
if (cluster_is_usable(ci, order)) {
@@ -985,9 +988,7 @@ new_cluster:
}
}
done:
- if (si->flags & SWP_SOLIDSTATE)
- local_unlock(&si->percpu_cluster->lock);
- else
+ if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster_lock);
return found;
}
@@ -1101,7 +1102,7 @@ static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
/*
* If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
- * remove it from the plist.
+ * add it to the plist.
*/
if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
add_to_avail_list(si, false);
@@ -1155,61 +1156,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_usage_sub(si, nr_entries);
}
-static int cluster_alloc_swap(struct swap_info_struct *si,
- unsigned char usage, int nr,
- swp_entry_t slots[], int order)
-{
- int n_ret = 0;
-
- while (n_ret < nr) {
- unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
-
- if (!offset)
- break;
- slots[n_ret++] = swp_entry(si->type, offset);
- }
-
- return n_ret;
-}
-
-static int scan_swap_map_slots(struct swap_info_struct *si,
- unsigned char usage, int nr,
- swp_entry_t slots[], int order)
-{
- unsigned int nr_pages = 1 << order;
-
- /*
- * We try to cluster swap pages by allocating them sequentially
- * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
- * way, however, we resort to first-free allocation, starting
- * a new cluster. This prevents us from scattering swap pages
- * all over the entire swap partition, so that we reduce
- * overall disk seek times between swap pages. -- sct
- * But we do now try to find an empty cluster. -Andrea
- * And we let swap pages go all over an SSD partition. Hugh
- */
- if (order > 0) {
- /*
- * Should not even be attempting large allocations when huge
- * page swap is disabled. Warn and fail the allocation.
- */
- if (!IS_ENABLED(CONFIG_THP_SWAP) ||
- nr_pages > SWAPFILE_CLUSTER) {
- VM_WARN_ON_ONCE(1);
- return 0;
- }
-
- /*
- * Swapfile is not block device so unable
- * to allocate large entries.
- */
- if (!(si->flags & SWP_BLKDEV))
- return 0;
- }
-
- return cluster_alloc_swap(si, usage, nr, slots, order);
-}
-
static bool get_swap_device_info(struct swap_info_struct *si)
{
if (!percpu_ref_tryget_live(&si->users))
@@ -1226,39 +1172,65 @@ static bool get_swap_device_info(struct swap_info_struct *si)
return true;
}
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
+/*
+ * Fast path try to get swap entries with specified order from current
+ * CPU's swap entry pool (a cluster).
+ */
+static bool swap_alloc_fast(swp_entry_t *entry,
+ int order)
{
- int order = swap_entry_order(entry_order);
- unsigned long size = 1 << order;
- struct swap_info_struct *si, *next;
- long avail_pgs;
- int n_ret = 0;
- int node;
+ struct swap_cluster_info *ci;
+ struct swap_info_struct *si;
+ unsigned int offset, found = SWAP_ENTRY_INVALID;
- spin_lock(&swap_avail_lock);
+ /*
+ * Once allocated, swap_info_struct will never be completely freed,
+ * so checking it's liveness by get_swap_device_info is enough.
+ */
+ si = this_cpu_read(percpu_swap_cluster.si[order]);
+ offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+ if (!si || !offset || !get_swap_device_info(si))
+ return false;
- avail_pgs = atomic_long_read(&nr_swap_pages) / size;
- if (avail_pgs <= 0) {
- spin_unlock(&swap_avail_lock);
- goto noswap;
+ ci = lock_cluster(si, offset);
+ if (cluster_is_usable(ci, order)) {
+ if (cluster_is_empty(ci))
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE);
+ if (found)
+ *entry = swp_entry(si->type, found);
+ } else {
+ unlock_cluster(ci);
}
- n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
+ put_swap_device(si);
+ return !!found;
+}
- atomic_long_sub(n_goal * size, &nr_swap_pages);
+/* Rotate the device and switch to a new cluster */
+static bool swap_alloc_slow(swp_entry_t *entry,
+ int order)
+{
+ int node;
+ unsigned long offset;
+ struct swap_info_struct *si, *next;
-start_over:
node = numa_node_id();
+ spin_lock(&swap_avail_lock);
+start_over:
plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
- /* requeue si to after same-priority siblings */
+ /* Rotate the device and switch to a new cluster */
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
- n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
- n_goal, swp_entries, order);
+ offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
put_swap_device(si);
- if (n_ret || size > 1)
- goto check_out;
+ if (offset) {
+ *entry = swp_entry(si->type, offset);
+ return true;
+ }
+ if (order)
+ return false;
}
spin_lock(&swap_avail_lock);
@@ -1276,15 +1248,68 @@ start_over:
if (plist_node_empty(&next->avail_lists[node]))
goto start_over;
}
-
spin_unlock(&swap_avail_lock);
+ return false;
+}
+
+/**
+ * folio_alloc_swap - allocate swap space for a folio
+ * @folio: folio we want to move to swap
+ * @gfp: gfp mask for shadow nodes
+ *
+ * Allocate swap space for the folio and add the folio to the
+ * swap cache.
+ *
+ * Context: Caller needs to hold the folio lock.
+ * Return: Whether the folio was added to the swap cache.
+ */
+int folio_alloc_swap(struct folio *folio, gfp_t gfp)
+{
+ unsigned int order = folio_order(folio);
+ unsigned int size = 1 << order;
+ swp_entry_t entry = {};
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
+
+ /*
+ * Should not even be attempting large allocations when huge
+ * page swap is disabled. Warn and fail the allocation.
+ */
+ if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) {
+ VM_WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+
+ local_lock(&percpu_swap_cluster.lock);
+ if (!swap_alloc_fast(&entry, order))
+ swap_alloc_slow(&entry, order);
+ local_unlock(&percpu_swap_cluster.lock);
-check_out:
- if (n_ret < n_goal)
- atomic_long_add((long)(n_goal - n_ret) * size,
- &nr_swap_pages);
-noswap:
- return n_ret;
+ /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
+ if (mem_cgroup_try_charge_swap(folio, entry))
+ goto out_free;
+
+ if (!entry.val)
+ return -ENOMEM;
+
+ /*
+ * XArray node allocations from PF_MEMALLOC contexts could
+ * completely exhaust the page allocator. __GFP_NOMEMALLOC
+ * stops emergency reserves from being allocated.
+ *
+ * TODO: this could cause a theoretical memory reclaim
+ * deadlock in the swap out path.
+ */
+ if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
+ goto out_free;
+
+ atomic_long_sub(size, &nr_swap_pages);
+ return 0;
+
+out_free:
+ put_swap_folio(folio, entry);
+ return -ENOMEM;
}
static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
@@ -1569,7 +1594,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
return;
ci = lock_cluster(si, offset);
- if (swap_is_has_cache(si, offset, size))
+ if (swap_only_has_cache(si, offset, size))
swap_entry_range_free(si, ci, entry, size);
else {
for (int i = 0; i < size; i++, entry.val++) {
@@ -1580,25 +1605,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
unlock_cluster(ci);
}
-void swapcache_free_entries(swp_entry_t *entries, int n)
-{
- int i;
- struct swap_cluster_info *ci;
- struct swap_info_struct *si = NULL;
-
- if (n <= 0)
- return;
-
- for (i = 0; i < n; ++i) {
- si = _swap_info_get(entries[i]);
- if (si) {
- ci = lock_cluster(si, swp_offset(entries[i]));
- swap_entry_range_free(si, ci, entries[i], 1);
- unlock_cluster(ci);
- }
- }
-}
-
int __swap_count(swp_entry_t entry)
{
struct swap_info_struct *si = swp_swap_info(entry);
@@ -1612,7 +1618,7 @@ int __swap_count(swp_entry_t entry)
* This does not give an exact answer when swap count is continued,
* but does include the high COUNT_CONTINUED flag to allow for that.
*/
-int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
{
pgoff_t offset = swp_offset(entry);
struct swap_cluster_info *ci;
@@ -1621,7 +1627,7 @@ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
ci = lock_cluster(si, offset);
count = swap_count(si->swap_map[offset]);
unlock_cluster(ci);
- return count;
+ return !!count;
}
/*
@@ -1707,7 +1713,7 @@ static bool folio_swapped(struct folio *folio)
return false;
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
- return swap_swapcount(si, entry) != 0;
+ return swap_entry_swapped(si, entry);
return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
}
@@ -1781,9 +1787,6 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
bool any_only_cache = false;
unsigned long offset;
- if (non_swap_entry(entry))
- return;
-
si = get_swap_device(entry);
if (!si)
return;
@@ -1842,6 +1845,7 @@ out:
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si = swap_type_to_swap_info(type);
+ unsigned long offset;
swp_entry_t entry = {0};
if (!si)
@@ -1849,8 +1853,13 @@ swp_entry_t get_swap_page_of_type(int type)
/* This is called for allocating swap entry, not cache */
if (get_swap_device_info(si)) {
- if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
- atomic_long_dec(&nr_swap_pages);
+ if (si->flags & SWP_WRITEOK) {
+ offset = cluster_alloc_swap_entry(si, 0, 1);
+ if (offset) {
+ entry = swp_entry(si->type, offset);
+ atomic_long_dec(&nr_swap_pages);
+ }
+ }
put_swap_device(si);
}
fail:
@@ -2611,21 +2620,6 @@ static void reinsert_swap_info(struct swap_info_struct *si)
spin_unlock(&swap_lock);
}
-static bool __has_usable_swap(void)
-{
- return !plist_head_empty(&swap_active_head);
-}
-
-bool has_usable_swap(void)
-{
- bool ret;
-
- spin_lock(&swap_lock);
- ret = __has_usable_swap();
- spin_unlock(&swap_lock);
- return ret;
-}
-
/*
* Called after clearing SWP_WRITEOK, ensures cluster_alloc_range
* see the updated flags, so there will be no more allocations.
@@ -2641,10 +2635,31 @@ static void wait_for_allocation(struct swap_info_struct *si)
for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
ci = lock_cluster(si, offset);
unlock_cluster(ci);
- offset += SWAPFILE_CLUSTER;
}
}
+/*
+ * Called after swap device's reference count is dead, so
+ * neither scan nor allocation will use it.
+ */
+static void flush_percpu_swap_cluster(struct swap_info_struct *si)
+{
+ int cpu, i;
+ struct swap_info_struct **pcp_si;
+
+ for_each_possible_cpu(cpu) {
+ pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu);
+ /*
+ * Invalidate the percpu swap cluster cache, si->users
+ * is dead, so no new user will point to it, just flush
+ * any existing user.
+ */
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cmpxchg(&pcp_si[i], si, NULL);
+ }
+}
+
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -2717,8 +2732,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
wait_for_allocation(p);
- disable_swap_slots_cache_lock();
-
set_current_oom_origin();
err = try_to_unuse(p->type);
clear_current_oom_origin();
@@ -2726,12 +2739,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (err) {
/* re-insert swap space back into swap_list */
reinsert_swap_info(p);
- reenable_swap_slots_cache_unlock();
goto out_dput;
}
- reenable_swap_slots_cache_unlock();
-
/*
* Wait for swap operations protected by get/put_swap_device()
* to complete. Because of synchronize_rcu() here, all swap
@@ -2746,6 +2756,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
flush_work(&p->discard_work);
flush_work(&p->reclaim_work);
+ flush_percpu_swap_cluster(p);
destroy_swap_extents(p);
if (p->flags & SWP_CONTINUED)
@@ -2773,8 +2784,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
arch_swap_invalidate_area(p->type);
zswap_swapoff(p->type);
mutex_unlock(&swapon_mutex);
- free_percpu(p->percpu_cluster);
- p->percpu_cluster = NULL;
kfree(p->global_cluster);
p->global_cluster = NULL;
vfree(swap_map);
@@ -3120,13 +3129,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
return maxpages;
}
-#define SWAP_CLUSTER_INFO_COLS \
- DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
-#define SWAP_CLUSTER_SPACE_COLS \
- DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
-#define SWAP_CLUSTER_COLS \
- max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
-
static int setup_swap_map_and_extents(struct swap_info_struct *si,
union swap_header *swap_header,
unsigned char *swap_map,
@@ -3166,14 +3168,21 @@ static int setup_swap_map_and_extents(struct swap_info_struct *si,
return nr_extents;
}
+#define SWAP_CLUSTER_INFO_COLS \
+ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
+#define SWAP_CLUSTER_SPACE_COLS \
+ DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
+#define SWAP_CLUSTER_COLS \
+ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
+
static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
union swap_header *swap_header,
unsigned long maxpages)
{
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
struct swap_cluster_info *cluster_info;
- unsigned long i, j, k, idx;
- int cpu, err = -ENOMEM;
+ unsigned long i, j, idx;
+ int err = -ENOMEM;
cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
if (!cluster_info)
@@ -3182,20 +3191,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- if (si->flags & SWP_SOLIDSTATE) {
- si->percpu_cluster = alloc_percpu(struct percpu_cluster);
- if (!si->percpu_cluster)
- goto err_free;
-
- for_each_possible_cpu(cpu) {
- struct percpu_cluster *cluster;
-
- cluster = per_cpu_ptr(si->percpu_cluster, cpu);
- for (i = 0; i < SWAP_NR_ORDERS; i++)
- cluster->next[i] = SWAP_ENTRY_INVALID;
- local_lock_init(&cluster->lock);
- }
- } else {
+ if (!(si->flags & SWP_SOLIDSTATE)) {
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
GFP_KERNEL);
if (!si->global_cluster)
@@ -3233,8 +3229,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
* Reduce false cache line sharing between cluster_info and
* sharing same address space.
*/
- for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
- j = k % SWAP_CLUSTER_COLS;
+ for (j = 0; j < SWAP_CLUSTER_COLS; j++) {
for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
struct swap_cluster_info *ci;
idx = i * SWAP_CLUSTER_COLS + j;
@@ -3449,8 +3444,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
mutex_lock(&swapon_mutex);
prio = -1;
if (swap_flags & SWAP_FLAG_PREFER)
- prio =
- (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
+ prio = swap_flags & SWAP_FLAG_PRIO_MASK;
enable_swap_info(si, prio, swap_map, cluster_info, zeromap);
pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n",
@@ -3474,8 +3468,6 @@ free_swap_address_space:
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
- free_percpu(si->percpu_cluster);
- si->percpu_cluster = NULL;
kfree(si->global_cluster);
si->global_cluster = NULL;
inode = NULL;
@@ -3499,8 +3491,6 @@ out:
putname(name);
if (inode)
inode_unlock(inode);
- if (!error)
- enable_swap_slots_cache();
return error;
}
@@ -3527,7 +3517,6 @@ void si_swapinfo(struct sysinfo *val)
* Returns error code in following case.
* - success -> 0
* - swp_entry is invalid -> EINVAL
- * - swp_entry is migration entry -> EINVAL
* - swap-cache reference is requested but there is already one. -> EEXIST
* - swap-cache reference is requested but the entry is not used. -> ENOENT
* - swap-mapped reference requested but needs continued swap count. -> ENOMEM
@@ -3542,6 +3531,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
int err, i;
si = swp_swap_info(entry);
+ if (WARN_ON_ONCE(!si)) {
+ pr_err("%s%08lx\n", Bad_file, entry.val);
+ return -EINVAL;
+ }
offset = swp_offset(entry);
VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
@@ -3787,8 +3780,8 @@ outer:
* into, carry if so, or else fail until a new continuation page is allocated;
* when the original swap_map count is decremented from 0 with continuation,
* borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
- * lock.
+ * Called while __swap_duplicate() or caller of __swap_entry_free_locked()
+ * holds cluster lock.
*/
static bool swap_count_continued(struct swap_info_struct *si,
pgoff_t offset, unsigned char count)
@@ -3893,6 +3886,11 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+static bool __has_usable_swap(void)
+{
+ return !plist_head_empty(&swap_active_head);
+}
+
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
struct swap_info_struct *si, *next;
diff --git a/mm/truncate.c b/mm/truncate.c
index e2e115adfbc5..5d98054094d1 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -78,8 +78,22 @@ static void truncate_folio_batch_exceptionals(struct address_space *mapping,
if (dax_mapping(mapping)) {
for (i = j; i < nr; i++) {
- if (xa_is_value(fbatch->folios[i]))
+ if (xa_is_value(fbatch->folios[i])) {
+ /*
+ * File systems should already have called
+ * dax_break_layout_entry() to remove all DAX
+ * entries while holding a lock to prevent
+ * establishing new entries. Therefore we
+ * shouldn't find any here.
+ */
+ WARN_ON_ONCE(1);
+
+ /*
+ * Delete the mapping so truncate_pagecache()
+ * doesn't loop forever.
+ */
dax_delete_mapping_entry(mapping, indices[i]);
+ }
}
goto out;
}
@@ -178,6 +192,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
loff_t pos = folio_pos(folio);
unsigned int offset, length;
+ struct page *split_at, *split_at2;
if (pos < start)
offset = start - pos;
@@ -207,8 +222,42 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
folio_invalidate(folio, offset, length);
if (!folio_test_large(folio))
return true;
- if (split_folio(folio) == 0)
+
+ split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
+ split_at2 = folio_page(folio,
+ PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
+
+ if (!try_folio_split(folio, split_at, NULL)) {
+ /*
+ * try to split at offset + length to make sure folios within
+ * the range can be dropped, especially to avoid memory waste
+ * for shmem truncate
+ */
+ struct folio *folio2 = page_folio(split_at2);
+
+ if (!folio_try_get(folio2))
+ goto no_split;
+
+ if (!folio_test_large(folio2))
+ goto out;
+
+ if (!folio_trylock(folio2))
+ goto out;
+
+ /*
+ * make sure folio2 is large and does not change its mapping.
+ * Its split result does not matter here.
+ */
+ if (folio_test_large(folio2) &&
+ folio2->mapping == folio->mapping)
+ try_folio_split(folio2, split_at2, NULL);
+
+ folio_unlock(folio2);
+out:
+ folio_put(folio2);
+no_split:
return true;
+ }
if (folio_test_dirty(folio))
return false;
truncate_inode_folio(folio->mapping, folio);
@@ -548,8 +597,6 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- if (folio_test_dirty(folio))
- return 0;
if (folio_mapped(folio))
unmap_mapping_folio(folio);
BUG_ON(folio_mapped(folio));
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 83c164aba6e0..dbdcc43964fb 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -17,7 +17,7 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
-#include <linux/thread_info.h>
+#include <linux/ucopysize.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
@@ -201,7 +201,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
}
}
-static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
+DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
+ validate_usercopy_range);
+EXPORT_SYMBOL(validate_usercopy_range);
/*
* Validates that the given object is:
@@ -212,9 +214,6 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
- if (static_branch_unlikely(&bypass_usercopy_checks))
- return;
-
/* Skip all tests if size is zero. */
if (!n)
return;
@@ -255,7 +254,8 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
}
EXPORT_SYMBOL(__check_object_size);
-static bool enable_checks __initdata = true;
+static bool enable_checks __initdata =
+ IS_ENABLED(CONFIG_HARDENED_USERCOPY_DEFAULT_ON);
static int __init parse_hardened_usercopy(char *str)
{
@@ -269,8 +269,10 @@ __setup("hardened_usercopy=", parse_hardened_usercopy);
static int __init set_hardened_usercopy(void)
{
- if (enable_checks == false)
- static_branch_enable(&bypass_usercopy_checks);
+ if (enable_checks)
+ static_branch_enable(&validate_usercopy_range);
+ else
+ static_branch_disable(&validate_usercopy_range);
return 1;
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index af3dfc3633db..fbf2cf62ab9f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -18,6 +18,7 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
+#include "swap.h"
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
@@ -85,14 +86,10 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
mmap_read_lock(mm);
vma = find_vma_and_prepare_anon(mm, address);
if (!IS_ERR(vma)) {
- /*
- * We cannot use vma_start_read() as it may fail due to
- * false locked (see comment in vma_start_read()). We
- * can avoid that by directly locking vm_lock under
- * mmap_lock, which guarantees that nobody can lock the
- * vma for write (vma_start_write()) under us.
- */
- down_read(&vma->vm_lock->lock);
+ bool locked = vma_start_read_locked(vma);
+
+ if (!locked)
+ vma = ERR_PTR(-EAGAIN);
}
mmap_read_unlock(mm);
@@ -1076,16 +1073,14 @@ out:
return err;
}
-static int move_swap_pte(struct mm_struct *mm,
+static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
pmd_t *dst_pmd, pmd_t dst_pmdval,
- spinlock_t *dst_ptl, spinlock_t *src_ptl)
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio *src_folio)
{
- if (!pte_swp_exclusive(orig_src_pte))
- return -EBUSY;
-
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1094,6 +1089,16 @@ static int move_swap_pte(struct mm_struct *mm,
return -EAGAIN;
}
+ /*
+ * The src_folio resides in the swapcache, requiring an update to its
+ * index and mapping to align with the dst_vma, where a swap-in may
+ * occur and hit the swapcache after moving the PTE.
+ */
+ if (src_folio) {
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
+ }
+
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
@@ -1141,6 +1146,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
__u64 mode)
{
swp_entry_t entry;
+ struct swap_info_struct *si = NULL;
pte_t orig_src_pte, orig_dst_pte;
pte_t src_folio_pte;
spinlock_t *src_ptl, *dst_ptl;
@@ -1240,6 +1246,7 @@ retry:
*/
if (!src_folio) {
struct folio *folio;
+ bool locked;
/*
* Pin the page while holding the lock to be sure the
@@ -1259,14 +1266,28 @@ retry:
goto out;
}
+ locked = folio_trylock(folio);
+ /*
+ * We avoid waiting for folio lock with a raised
+ * refcount for large folios because extra refcounts
+ * will result in split_folio() failing later and
+ * retrying. If multiple tasks are trying to move a
+ * large folio we can end up livelocking.
+ */
+ if (!locked && folio_test_large(folio)) {
+ spin_unlock(src_ptl);
+ err = -EAGAIN;
+ goto out;
+ }
+
folio_get(folio);
src_folio = folio;
src_folio_pte = orig_src_pte;
spin_unlock(src_ptl);
- if (!folio_trylock(src_folio)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ if (!locked) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
folio_lock(src_folio);
@@ -1282,8 +1303,8 @@ retry:
/* at this point we have src_folio locked */
if (folio_test_large(src_folio)) {
/* split_folio() can block */
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
err = split_folio(src_folio);
if (err)
@@ -1308,8 +1329,8 @@ retry:
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
anon_vma_lock_write(src_anon_vma);
@@ -1322,11 +1343,13 @@ retry:
orig_dst_pte, orig_src_pte, dst_pmd,
dst_pmdval, dst_ptl, src_ptl, src_folio);
} else {
+ struct folio *folio = NULL;
+
entry = pte_to_swp_entry(orig_src_pte);
if (non_swap_entry(entry)) {
if (is_migration_entry(entry)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
err = -EAGAIN;
@@ -1335,9 +1358,53 @@ retry:
goto out;
}
- err = move_swap_pte(mm, dst_addr, src_addr, dst_pte, src_pte,
- orig_dst_pte, orig_src_pte, dst_pmd,
- dst_pmdval, dst_ptl, src_ptl);
+ if (!pte_swp_exclusive(orig_src_pte)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ si = get_swap_device(entry);
+ if (unlikely(!si)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ /*
+ * Verify the existence of the swapcache. If present, the folio's
+ * index and mapping must be updated even when the PTE is a swap
+ * entry. The anon_vma lock is not taken during this process since
+ * the folio has already been unmapped, and the swap entry is
+ * exclusive, preventing rmap walks.
+ *
+ * For large folios, return -EBUSY immediately, as split_folio()
+ * also returns -EBUSY when attempting to split unmapped large
+ * folios in the swapcache. This issue needs to be resolved
+ * separately to allow proper handling.
+ */
+ if (!src_folio)
+ folio = filemap_get_folio(swap_address_space(entry),
+ swap_cache_index(entry));
+ if (!IS_ERR_OR_NULL(folio)) {
+ if (folio_test_large(folio)) {
+ err = -EBUSY;
+ folio_put(folio);
+ goto out;
+ }
+ src_folio = folio;
+ src_folio_pte = orig_src_pte;
+ if (!folio_trylock(src_folio)) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ put_swap_device(si);
+ si = NULL;
+ /* now we can block and wait */
+ folio_lock(src_folio);
+ goto retry;
+ }
+ }
+ err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
+ dst_ptl, src_ptl, src_folio);
}
out:
@@ -1354,6 +1421,8 @@ out:
if (src_pte)
pte_unmap(src_pte);
mmu_notifier_invalidate_range_end(&range);
+ if (si)
+ put_swap_device(si);
return err;
}
@@ -1490,16 +1559,24 @@ static int uffd_move_lock(struct mm_struct *mm,
mmap_read_lock(mm);
err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
- if (!err) {
- /*
- * See comment in uffd_lock_vma() as to why not using
- * vma_start_read() here.
- */
- down_read(&(*dst_vmap)->vm_lock->lock);
- if (*dst_vmap != *src_vmap)
- down_read_nested(&(*src_vmap)->vm_lock->lock,
- SINGLE_DEPTH_NESTING);
+ if (err)
+ goto out;
+
+ if (!vma_start_read_locked(*dst_vmap)) {
+ err = -EAGAIN;
+ goto out;
}
+
+ /* Nothing further to do if both vmas are locked. */
+ if (*dst_vmap == *src_vmap)
+ goto out;
+
+ if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) {
+ /* Undo dst_vmap locking if src_vmap failed to lock */
+ vma_end_read(*dst_vmap);
+ err = -EAGAIN;
+ }
+out:
mmap_read_unlock(mm);
return err;
}
diff --git a/mm/util.c b/mm/util.c
index b6b9684a1438..448117da071f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -12,6 +12,7 @@
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/sysctl.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
@@ -23,6 +24,7 @@
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
+#include <linux/fsnotify.h>
#include <linux/uaccess.h>
@@ -569,6 +571,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
+ if (!ret)
+ ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -612,168 +616,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
}
EXPORT_SYMBOL(vm_mmap);
-static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
-{
- /*
- * We want to attempt a large physically contiguous block first because
- * it is less likely to fragment multiple larger blocks and therefore
- * contribute to a long term fragmentation less than vmalloc fallback.
- * However make sure that larger requests are not too disruptive - no
- * OOM killer and no allocation failure warnings as we have a fallback.
- */
- if (size > PAGE_SIZE) {
- flags |= __GFP_NOWARN;
-
- if (!(flags & __GFP_RETRY_MAYFAIL))
- flags |= __GFP_NORETRY;
-
- /* nofail semantic is implemented by the vmalloc fallback */
- flags &= ~__GFP_NOFAIL;
- }
-
- return flags;
-}
-
-/**
- * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
- * failure, fall back to non-contiguous (vmalloc) allocation.
- * @size: size of the request.
- * @b: which set of kmalloc buckets to allocate from.
- * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
- * @node: numa node to allocate from
- *
- * Uses kmalloc to get the memory but if the allocation fails then falls back
- * to the vmalloc allocator. Use kvfree for freeing the memory.
- *
- * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
- * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
- * preferable to the vmalloc fallback, due to visible performance drawbacks.
- *
- * Return: pointer to the allocated memory of %NULL in case of failure
- */
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
-{
- void *ret;
-
- /*
- * It doesn't really make sense to fallback to vmalloc for sub page
- * requests
- */
- ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b),
- kmalloc_gfp_adjust(flags, size),
- node);
- if (ret || size <= PAGE_SIZE)
- return ret;
-
- /* non-sleeping allocations are not supported by vmalloc */
- if (!gfpflags_allow_blocking(flags))
- return NULL;
-
- /* Don't even allow crazy sizes */
- if (unlikely(size > INT_MAX)) {
- WARN_ON_ONCE(!(flags & __GFP_NOWARN));
- return NULL;
- }
-
- /*
- * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
- * since the callers already cannot assume anything
- * about the resulting pointer, and cannot play
- * protection games.
- */
- return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- node, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(__kvmalloc_node_noprof);
-
-/**
- * kvfree() - Free memory.
- * @addr: Pointer to allocated memory.
- *
- * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
- * It is slightly more efficient to use kfree() or vfree() if you are certain
- * that you know which one to use.
- *
- * Context: Either preemptible task context or not-NMI interrupt.
- */
-void kvfree(const void *addr)
-{
- if (is_vmalloc_addr(addr))
- vfree(addr);
- else
- kfree(addr);
-}
-EXPORT_SYMBOL(kvfree);
-
-/**
- * kvfree_sensitive - Free a data object containing sensitive information.
- * @addr: address of the data object to be freed.
- * @len: length of the data object.
- *
- * Use the special memzero_explicit() function to clear the content of a
- * kvmalloc'ed object containing sensitive data to make sure that the
- * compiler won't optimize out the data clearing.
- */
-void kvfree_sensitive(const void *addr, size_t len)
-{
- if (likely(!ZERO_OR_NULL_PTR(addr))) {
- memzero_explicit((void *)addr, len);
- kvfree(addr);
- }
-}
-EXPORT_SYMBOL(kvfree_sensitive);
-
-/**
- * kvrealloc - reallocate memory; contents remain unchanged
- * @p: object to reallocate memory for
- * @size: the size to reallocate
- * @flags: the flags for the page level allocator
- *
- * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
- * and @p is not a %NULL pointer, the object pointed to is freed.
- *
- * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
- * initial memory allocation, every subsequent call to this API for the same
- * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
- * __GFP_ZERO is not fully honored by this API.
- *
- * In any case, the contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes.
- *
- * This function must not be called concurrently with itself or kvfree() for the
- * same memory allocation.
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
-{
- void *n;
-
- if (is_vmalloc_addr(p))
- return vrealloc_noprof(p, size, flags);
-
- n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
- if (!n) {
- /* We failed to krealloc(), fall back to kvmalloc(). */
- n = kvmalloc_noprof(size, flags);
- if (!n)
- return NULL;
-
- if (p) {
- /* We already know that `p` is not a vmalloc address. */
- kasan_disable_current();
- memcpy(n, kasan_reset_tag(p), ksize(p));
- kasan_enable_current();
-
- kfree(p);
- }
- }
-
- return n;
-}
-EXPORT_SYMBOL(kvrealloc_noprof);
-
/**
* __vmalloc_array - allocate memory for a virtually contiguous array.
* @n: number of elements.
@@ -906,14 +748,16 @@ int folio_mc_copy(struct folio *dst, struct folio *src)
EXPORT_SYMBOL(folio_mc_copy);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
-int sysctl_overcommit_ratio __read_mostly = 50;
-unsigned long sysctl_overcommit_kbytes __read_mostly;
+static int sysctl_overcommit_ratio __read_mostly = 50;
+static unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
-int overcommit_ratio_handler(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+#ifdef CONFIG_SYSCTL
+
+static int overcommit_ratio_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -928,8 +772,8 @@ static void sync_overcommit_as(struct work_struct *dummy)
percpu_counter_sync(&vm_committed_as);
}
-int overcommit_policy_handler(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+static int overcommit_policy_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int new_policy = -1;
@@ -964,8 +808,8 @@ int overcommit_policy_handler(const struct ctl_table *table, int write, void *bu
return ret;
}
-int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -975,6 +819,54 @@ int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *bu
return ret;
}
+static const struct ctl_table util_sysctl_table[] = {
+ {
+ .procname = "overcommit_memory",
+ .data = &sysctl_overcommit_memory,
+ .maxlen = sizeof(sysctl_overcommit_memory),
+ .mode = 0644,
+ .proc_handler = overcommit_policy_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "overcommit_ratio",
+ .data = &sysctl_overcommit_ratio,
+ .maxlen = sizeof(sysctl_overcommit_ratio),
+ .mode = 0644,
+ .proc_handler = overcommit_ratio_handler,
+ },
+ {
+ .procname = "overcommit_kbytes",
+ .data = &sysctl_overcommit_kbytes,
+ .maxlen = sizeof(sysctl_overcommit_kbytes),
+ .mode = 0644,
+ .proc_handler = overcommit_kbytes_handler,
+ },
+ {
+ .procname = "user_reserve_kbytes",
+ .data = &sysctl_user_reserve_kbytes,
+ .maxlen = sizeof(sysctl_user_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "admin_reserve_kbytes",
+ .data = &sysctl_admin_reserve_kbytes,
+ .maxlen = sizeof(sysctl_admin_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+};
+
+static int __init init_vm_util_sysctls(void)
+{
+ register_sysctl_init("vm", util_sysctl_table);
+ return 0;
+}
+subsys_initcall(init_vm_util_sysctls);
+#endif /* CONFIG_SYSCTL */
+
/*
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
*/
diff --git a/mm/vma.c b/mm/vma.c
index af1d549b179c..5cdc5612bfc1 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -52,10 +52,9 @@ struct mmap_state {
.pgoff = (map_)->pgoff, \
.file = (map_)->file, \
.prev = (map_)->prev, \
- .vma = vma_, \
+ .middle = vma_, \
.next = (vma_) ? NULL : (map_)->next, \
.state = VMA_MERGE_START, \
- .merge_flags = VMG_FLAG_DEFAULT, \
}
static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
@@ -107,29 +106,44 @@ static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1,
* init_multi_vma_prep() - Initializer for struct vma_prepare
* @vp: The vma_prepare struct
* @vma: The vma that will be altered once locked
- * @next: The next vma if it is to be adjusted
- * @remove: The first vma to be removed
- * @remove2: The second vma to be removed
+ * @vmg: The merge state that will be used to determine adjustment and VMA
+ * removal.
*/
static void init_multi_vma_prep(struct vma_prepare *vp,
struct vm_area_struct *vma,
- struct vm_area_struct *next,
- struct vm_area_struct *remove,
- struct vm_area_struct *remove2)
+ struct vma_merge_struct *vmg)
{
+ struct vm_area_struct *adjust;
+ struct vm_area_struct **remove = &vp->remove;
+
memset(vp, 0, sizeof(struct vma_prepare));
vp->vma = vma;
vp->anon_vma = vma->anon_vma;
- vp->remove = remove;
- vp->remove2 = remove2;
- vp->adj_next = next;
- if (!vp->anon_vma && next)
- vp->anon_vma = next->anon_vma;
+
+ if (vmg && vmg->__remove_middle) {
+ *remove = vmg->middle;
+ remove = &vp->remove2;
+ }
+ if (vmg && vmg->__remove_next)
+ *remove = vmg->next;
+
+ if (vmg && vmg->__adjust_middle_start)
+ adjust = vmg->middle;
+ else if (vmg && vmg->__adjust_next_start)
+ adjust = vmg->next;
+ else
+ adjust = NULL;
+
+ vp->adj_next = adjust;
+ if (!vp->anon_vma && adjust)
+ vp->anon_vma = adjust->anon_vma;
+
+ VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma &&
+ vp->anon_vma != adjust->anon_vma);
vp->file = vma->vm_file;
if (vp->file)
vp->mapping = vma->vm_file->f_mapping;
-
}
/*
@@ -306,7 +320,7 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
* us to insert it before dropping the locks
* (it may either follow vma or precede it).
*/
- vma_iter_store(vmi, vp->insert);
+ vma_iter_store_new(vmi, vp->insert);
mm->map_count++;
}
@@ -327,7 +341,7 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
if (vp->remove) {
again:
- vma_mark_detached(vp->remove, true);
+ vma_mark_detached(vp->remove);
if (vp->file) {
uprobe_munmap(vp->remove, vp->remove->vm_start,
vp->remove->vm_end);
@@ -362,7 +376,7 @@ again:
*/
static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma)
{
- init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
+ init_multi_vma_prep(vp, vma, NULL);
}
/*
@@ -406,17 +420,14 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg,
/*
* Close a vm structure and free it.
*/
-void remove_vma(struct vm_area_struct *vma, bool unreachable)
+void remove_vma(struct vm_area_struct *vma)
{
might_sleep();
vma_close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
- if (unreachable)
- __vm_area_free(vma);
- else
- vm_area_free(vma);
+ vm_area_free(vma);
}
/*
@@ -499,7 +510,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
- vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+ vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
if (new_below) {
vma->vm_start = addr;
@@ -629,49 +640,66 @@ void validate_mm(struct mm_struct *mm)
}
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
-/* Actually perform the VMA merge operation. */
-static int commit_merge(struct vma_merge_struct *vmg,
- struct vm_area_struct *adjust,
- struct vm_area_struct *remove,
- struct vm_area_struct *remove2,
- long adj_start,
- bool expanded)
+/*
+ * Based on the vmg flag indicating whether we need to adjust the vm_start field
+ * for the middle or next VMA, we calculate what the range of the newly adjusted
+ * VMA ought to be, and set the VMA's range accordingly.
+ */
+static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
{
- struct vma_prepare vp;
+ struct vm_area_struct *adjust;
+ pgoff_t pgoff;
- init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
+ if (vmg->__adjust_middle_start) {
+ adjust = vmg->middle;
+ pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start);
+ } else if (vmg->__adjust_next_start) {
+ adjust = vmg->next;
+ pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end);
+ } else {
+ return;
+ }
- VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
- vp.anon_vma != adjust->anon_vma);
+ vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff);
+}
- if (expanded) {
- /* Note: vma iterator must be pointing to 'start'. */
- vma_iter_config(vmg->vmi, vmg->start, vmg->end);
+/*
+ * Actually perform the VMA merge operation.
+ *
+ * Returns 0 on success, or an error value on failure.
+ */
+static int commit_merge(struct vma_merge_struct *vmg)
+{
+ struct vm_area_struct *vma;
+ struct vma_prepare vp;
+
+ if (vmg->__adjust_next_start) {
+ /* We manipulate middle and adjust next, which is the target. */
+ vma = vmg->middle;
+ vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end);
} else {
- vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
- adjust->vm_end);
+ vma = vmg->target;
+ /* Note: vma iterator must be pointing to 'start'. */
+ vma_iter_config(vmg->vmi, vmg->start, vmg->end);
}
- if (vma_iter_prealloc(vmg->vmi, vmg->vma))
+ init_multi_vma_prep(&vp, vma, vmg);
+
+ if (vma_iter_prealloc(vmg->vmi, vma))
return -ENOMEM;
vma_prepare(&vp);
- vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
- vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
-
- if (expanded)
- vma_iter_store(vmg->vmi, vmg->vma);
-
- if (adj_start) {
- adjust->vm_start += adj_start;
- adjust->vm_pgoff += PHYS_PFN(adj_start);
- if (adj_start < 0) {
- WARN_ON(expanded);
- vma_iter_store(vmg->vmi, adjust);
- }
- }
+ /*
+ * THP pages may need to do additional splits if we increase
+ * middle->vm_start.
+ */
+ vma_adjust_trans_huge(vma, vmg->start, vmg->end,
+ vmg->__adjust_middle_start ? vmg->middle : NULL);
+ vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
+ vmg_adjust_set_range(vmg);
+ vma_iter_store_overwrite(vmg->vmi, vmg->target);
- vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
+ vma_complete(&vp, vmg->vmi, vma->vm_mm);
return 0;
}
@@ -694,8 +722,9 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma)
* identical properties.
*
* This function checks for the existence of any such mergeable VMAs and updates
- * the maple tree describing the @vmg->vma->vm_mm address space to account for
- * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
+ * the maple tree describing the @vmg->middle->vm_mm address space to account
+ * for this, as well as any VMAs shrunk/expanded/deleted as a result of this
+ * merge.
*
* As part of this operation, if a merge occurs, the @vmg object will have its
* vma, start, end, and pgoff fields modified to execute the merge. Subsequent
@@ -704,45 +733,43 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma)
* Returns: The merged VMA if merge succeeds, or NULL otherwise.
*
* ASSUMPTIONS:
- * - The caller must assign the VMA to be modifed to @vmg->vma.
+ * - The caller must assign the VMA to be modifed to @vmg->middle.
* - The caller must have set @vmg->prev to the previous VMA, if there is one.
* - The caller must not set @vmg->next, as we determine this.
* - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
- * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end).
+ * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end).
*/
static __must_check struct vm_area_struct *vma_merge_existing_range(
struct vma_merge_struct *vmg)
{
- struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *middle = vmg->middle;
struct vm_area_struct *prev = vmg->prev;
- struct vm_area_struct *next, *res;
+ struct vm_area_struct *next;
struct vm_area_struct *anon_dup = NULL;
- struct vm_area_struct *adjust = NULL;
unsigned long start = vmg->start;
unsigned long end = vmg->end;
- bool left_side = vma && start == vma->vm_start;
- bool right_side = vma && end == vma->vm_end;
+ bool left_side = middle && start == middle->vm_start;
+ bool right_side = middle && end == middle->vm_end;
int err = 0;
- long adj_start = 0;
- bool merge_will_delete_vma, merge_will_delete_next;
bool merge_left, merge_right, merge_both;
- bool expanded;
mmap_assert_write_locked(vmg->mm);
- VM_WARN_ON_VMG(!vma, vmg); /* We are modifying a VMA, so caller must specify. */
+ VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */
VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */
VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg);
VM_WARN_ON_VMG(start >= end, vmg);
/*
- * If vma == prev, then we are offset into a VMA. Otherwise, if we are
+ * If middle == prev, then we are offset into a VMA. Otherwise, if we are
* not, we must span a portion of the VMA.
*/
- VM_WARN_ON_VMG(vma && ((vma != prev && vmg->start != vma->vm_start) ||
- vmg->end > vma->vm_end), vmg);
- /* The vmi must be positioned within vmg->vma. */
- VM_WARN_ON_VMG(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start &&
- vma_iter_addr(vmg->vmi) < vma->vm_end), vmg);
+ VM_WARN_ON_VMG(middle &&
+ ((middle != prev && vmg->start != middle->vm_start) ||
+ vmg->end > middle->vm_end), vmg);
+ /* The vmi must be positioned within vmg->middle. */
+ VM_WARN_ON_VMG(middle &&
+ !(vma_iter_addr(vmg->vmi) >= middle->vm_start &&
+ vma_iter_addr(vmg->vmi) < middle->vm_end), vmg);
vmg->state = VMA_MERGE_NOMERGE;
@@ -776,49 +803,52 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
merge_both = merge_left && merge_right;
/* If we span the entire VMA, a merge implies it will be deleted. */
- merge_will_delete_vma = left_side && right_side;
+ vmg->__remove_middle = left_side && right_side;
/*
- * If we need to remove vma in its entirety but are unable to do so,
+ * If we need to remove middle in its entirety but are unable to do so,
* we have no sensible recourse but to abort the merge.
*/
- if (merge_will_delete_vma && !can_merge_remove_vma(vma))
+ if (vmg->__remove_middle && !can_merge_remove_vma(middle))
return NULL;
/*
* If we merge both VMAs, then next is also deleted. This implies
* merge_will_delete_vma also.
*/
- merge_will_delete_next = merge_both;
+ vmg->__remove_next = merge_both;
/*
* If we cannot delete next, then we can reduce the operation to merging
- * prev and vma (thereby deleting vma).
+ * prev and middle (thereby deleting middle).
*/
- if (merge_will_delete_next && !can_merge_remove_vma(next)) {
- merge_will_delete_next = false;
+ if (vmg->__remove_next && !can_merge_remove_vma(next)) {
+ vmg->__remove_next = false;
merge_right = false;
merge_both = false;
}
- /* No matter what happens, we will be adjusting vma. */
- vma_start_write(vma);
-
- if (merge_left)
- vma_start_write(prev);
+ /* No matter what happens, we will be adjusting middle. */
+ vma_start_write(middle);
- if (merge_right)
+ if (merge_right) {
vma_start_write(next);
+ vmg->target = next;
+ }
+
+ if (merge_left) {
+ vma_start_write(prev);
+ vmg->target = prev;
+ }
if (merge_both) {
/*
- * |<----->|
- * |-------*********-------|
- * prev vma next
- * extend delete delete
+ * |<-------------------->|
+ * |-------********-------|
+ * prev middle next
+ * extend delete delete
*/
- vmg->vma = prev;
vmg->start = prev->vm_start;
vmg->end = next->vm_end;
vmg->pgoff = prev->vm_pgoff;
@@ -826,80 +856,62 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
/*
* We already ensured anon_vma compatibility above, so now it's
* simply a case of, if prev has no anon_vma object, which of
- * next or vma contains the anon_vma we must duplicate.
+ * next or middle contains the anon_vma we must duplicate.
*/
- err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
+ err = dup_anon_vma(prev, next->anon_vma ? next : middle,
+ &anon_dup);
} else if (merge_left) {
/*
- * |<----->| OR
- * |<--------->|
+ * |<------------>| OR
+ * |<----------------->|
* |-------*************
- * prev vma
+ * prev middle
* extend shrink/delete
*/
- vmg->vma = prev;
vmg->start = prev->vm_start;
vmg->pgoff = prev->vm_pgoff;
- if (!merge_will_delete_vma) {
- adjust = vma;
- adj_start = vmg->end - vma->vm_start;
- }
+ if (!vmg->__remove_middle)
+ vmg->__adjust_middle_start = true;
- err = dup_anon_vma(prev, vma, &anon_dup);
+ err = dup_anon_vma(prev, middle, &anon_dup);
} else { /* merge_right */
/*
- * |<----->| OR
- * |<--------->|
+ * |<------------->| OR
+ * |<----------------->|
* *************-------|
- * vma next
+ * middle next
* shrink/delete extend
*/
pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
VM_WARN_ON_VMG(!merge_right, vmg);
- /* If we are offset into a VMA, then prev must be vma. */
- VM_WARN_ON_VMG(vmg->start > vma->vm_start && prev && vma != prev, vmg);
+ /* If we are offset into a VMA, then prev must be middle. */
+ VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg);
- if (merge_will_delete_vma) {
- vmg->vma = next;
+ if (vmg->__remove_middle) {
vmg->end = next->vm_end;
vmg->pgoff = next->vm_pgoff - pglen;
} else {
- /*
- * We shrink vma and expand next.
- *
- * IMPORTANT: This is the ONLY case where the final
- * merged VMA is NOT vmg->vma, but rather vmg->next.
- */
-
- vmg->start = vma->vm_start;
+ /* We shrink middle and expand next. */
+ vmg->__adjust_next_start = true;
+ vmg->start = middle->vm_start;
vmg->end = start;
- vmg->pgoff = vma->vm_pgoff;
-
- adjust = next;
- adj_start = -(vma->vm_end - start);
+ vmg->pgoff = middle->vm_pgoff;
}
- err = dup_anon_vma(next, vma, &anon_dup);
+ err = dup_anon_vma(next, middle, &anon_dup);
}
if (err)
goto abort;
- /*
- * In nearly all cases, we expand vmg->vma. There is one exception -
- * merge_right where we partially span the VMA. In this case we shrink
- * the end of vmg->vma and adjust the start of vmg->next accordingly.
- */
- expanded = !merge_right || merge_will_delete_vma;
+ err = commit_merge(vmg);
+ if (err) {
+ VM_WARN_ON(err != -ENOMEM);
- if (commit_merge(vmg, adjust,
- merge_will_delete_vma ? vma : NULL,
- merge_will_delete_next ? next : NULL,
- adj_start, expanded)) {
if (anon_dup)
unlink_anon_vmas(anon_dup);
@@ -907,11 +919,9 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
return NULL;
}
- res = merge_left ? prev : next;
- khugepaged_enter_vma(res, vmg->flags);
-
+ khugepaged_enter_vma(vmg->target, vmg->flags);
vmg->state = VMA_MERGE_SUCCESS;
- return res;
+ return vmg->target;
abort:
vma_iter_set(vmg->vmi, start);
@@ -970,10 +980,9 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
struct vm_area_struct *next = vmg->next;
unsigned long end = vmg->end;
bool can_merge_left, can_merge_right;
- bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
mmap_assert_write_locked(vmg->mm);
- VM_WARN_ON_VMG(vmg->vma, vmg);
+ VM_WARN_ON_VMG(vmg->middle, vmg);
/* vmi must point at or before the gap. */
VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg);
@@ -984,18 +993,18 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
return NULL;
can_merge_left = can_vma_merge_left(vmg);
- can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left);
+ can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left);
/* If we can merge with the next VMA, adjust vmg accordingly. */
if (can_merge_right) {
vmg->end = next->vm_end;
- vmg->vma = next;
+ vmg->middle = next;
}
/* If we can merge with the previous VMA, adjust vmg accordingly. */
if (can_merge_left) {
vmg->start = prev->vm_start;
- vmg->vma = prev;
+ vmg->middle = prev;
vmg->pgoff = prev->vm_pgoff;
/*
@@ -1007,7 +1016,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
vmg->end = end;
/* In expand-only case we are already positioned at prev. */
- if (!just_expand) {
+ if (!vmg->just_expand) {
/* Equivalent to going to the previous range. */
vma_prev(vmg->vmi);
}
@@ -1017,10 +1026,10 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* Now try to expand adjacent VMA(s). This takes care of removing the
* following VMA if we have VMAs on both sides.
*/
- if (vmg->vma && !vma_expand(vmg)) {
- khugepaged_enter_vma(vmg->vma, vmg->flags);
+ if (vmg->middle && !vma_expand(vmg)) {
+ khugepaged_enter_vma(vmg->middle, vmg->flags);
vmg->state = VMA_MERGE_SUCCESS;
- return vmg->vma;
+ return vmg->middle;
}
return NULL;
@@ -1032,45 +1041,50 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* @vmg: Describes a VMA expansion operation.
*
* Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
- * Will expand over vmg->next if it's different from vmg->vma and vmg->end ==
- * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
+ * Will expand over vmg->next if it's different from vmg->middle and vmg->end ==
+ * vmg->next->vm_end. Checking if the vmg->middle can expand and merge with
* vmg->next needs to be handled by the caller.
*
* Returns: 0 on success.
*
* ASSUMPTIONS:
- * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
- * - The caller must have set @vmg->vma and @vmg->next.
+ * - The caller must hold a WRITE lock on vmg->middle->mm->mmap_lock.
+ * - The caller must have set @vmg->middle and @vmg->next.
*/
int vma_expand(struct vma_merge_struct *vmg)
{
struct vm_area_struct *anon_dup = NULL;
bool remove_next = false;
- struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *middle = vmg->middle;
struct vm_area_struct *next = vmg->next;
mmap_assert_write_locked(vmg->mm);
- vma_start_write(vma);
- if (next && (vma != next) && (vmg->end == next->vm_end)) {
+ vma_start_write(middle);
+ if (next && (middle != next) && (vmg->end == next->vm_end)) {
int ret;
remove_next = true;
/* This should already have been checked by this point. */
VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
vma_start_write(next);
- ret = dup_anon_vma(vma, next, &anon_dup);
+ ret = dup_anon_vma(middle, next, &anon_dup);
if (ret)
return ret;
}
/* Not merging but overwriting any part of next is not handled. */
VM_WARN_ON_VMG(next && !remove_next &&
- next != vma && vmg->end > next->vm_start, vmg);
+ next != middle && vmg->end > next->vm_start, vmg);
/* Only handles expanding */
- VM_WARN_ON_VMG(vma->vm_start < vmg->start || vma->vm_end > vmg->end, vmg);
+ VM_WARN_ON_VMG(middle->vm_start < vmg->start ||
+ middle->vm_end > vmg->end, vmg);
- if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
+ vmg->target = middle;
+ if (remove_next)
+ vmg->__remove_next = true;
+
+ if (commit_merge(vmg))
goto nomem;
return 0;
@@ -1110,7 +1124,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vma_prepare(&vp);
- vma_adjust_trans_huge(vma, start, end, 0);
+ vma_adjust_trans_huge(vma, start, end, NULL);
vma_iter_clear(vmi);
vma_set_range(vma, start, end, pgoff);
@@ -1199,7 +1213,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
/* Remove and clean up vmas */
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- remove_vma(vma, /* unreachable = */ false);
+ remove_vma(vma);
vm_unacct_memory(vms->nr_accounted);
validate_mm(mm);
@@ -1221,7 +1235,7 @@ static void reattach_vmas(struct ma_state *mas_detach)
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- vma_mark_detached(vma, false);
+ vma_mark_attached(vma);
__mt_destroy(mas_detach->tree);
}
@@ -1296,7 +1310,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
if (error)
goto munmap_gather_failed;
- vma_mark_detached(next, true);
+ vma_mark_detached(next);
nrpages = vma_pages(next);
vms->nr_pages += nrpages;
@@ -1508,25 +1522,29 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
*/
static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
{
- struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *vma = vmg->middle;
+ unsigned long start = vmg->start;
+ unsigned long end = vmg->end;
struct vm_area_struct *merged;
/* First, try to merge. */
merged = vma_merge_existing_range(vmg);
if (merged)
return merged;
+ if (vmg_nomem(vmg))
+ return ERR_PTR(-ENOMEM);
/* Split any preceding portion of the VMA. */
- if (vma->vm_start < vmg->start) {
- int err = split_vma(vmg->vmi, vma, vmg->start, 1);
+ if (vma->vm_start < start) {
+ int err = split_vma(vmg->vmi, vma, start, 1);
if (err)
return ERR_PTR(err);
}
/* Split any trailing portion of the VMA. */
- if (vma->vm_end > vmg->end) {
- int err = split_vma(vmg->vmi, vma, vmg->end, 0);
+ if (vma->vm_end > end) {
+ int err = split_vma(vmg->vmi, vma, end, 0);
if (err)
return ERR_PTR(err);
@@ -1605,7 +1623,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta);
vmg.next = vma_iter_next_rewind(vmi, NULL);
- vmg.vma = NULL; /* We use the VMA to populate VMG fields only. */
+ vmg.middle = NULL; /* We use the VMA to populate VMG fields only. */
return vma_merge_new_range(&vmg);
}
@@ -1690,7 +1708,7 @@ int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
return -ENOMEM;
vma_start_write(vma);
- vma_iter_store(&vmi, vma);
+ vma_iter_store_new(&vmi, vma);
vma_link_file(vma);
mm->map_count++;
validate_mm(mm);
@@ -1726,7 +1744,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
- vmg.vma = NULL; /* New VMA range. */
+ vmg.middle = NULL; /* New VMA range. */
vmg.pgoff = pgoff;
vmg.next = vma_iter_next_rewind(&vmi, NULL);
new_vma = vma_merge_new_range(&vmg);
@@ -2369,7 +2387,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
/* Lock the VMA since it is modified after insertion into VMA tree */
vma_start_write(vma);
- vma_iter_store(vmi, vma);
+ vma_iter_store_new(vmi, vma);
map->mm->map_count++;
vma_link_file(vma);
@@ -2377,7 +2395,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
* vma_merge_new_range() calls khugepaged_enter_vma() too, the below
* call covers the non-merge case.
*/
- khugepaged_enter_vma(vma, map->flags);
+ if (!vma_is_anonymous(vma))
+ khugepaged_enter_vma(vma, map->flags);
ksm_add_vma(vma);
*vmap = vma;
return 0;
@@ -2582,7 +2601,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
vmg.prev = vma;
/* vmi is positioned at prev, which this mode expects. */
- vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
+ vmg.just_expand = true;
if (vma_merge_new_range(&vmg))
goto out;
@@ -2845,7 +2864,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
/* Overwrite old entry in mtree. */
- vma_iter_store(&vmi, vma);
+ vma_iter_store_overwrite(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
perf_event_mmap(vma);
@@ -2925,7 +2944,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
vma->vm_start = address;
vma->vm_pgoff -= grow;
/* Overwrite old entry in mtree. */
- vma_iter_store(&vmi, vma);
+ vma_iter_store_overwrite(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
perf_event_mmap(vma);
diff --git a/mm/vma.h b/mm/vma.h
index a2e8710b8c47..7356ca5a22d3 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -58,35 +58,85 @@ enum vma_merge_state {
VMA_MERGE_SUCCESS,
};
-enum vma_merge_flags {
- VMG_FLAG_DEFAULT = 0,
- /*
- * If we can expand, simply do so. We know there is nothing to merge to
- * the right. Does not reset state upon failure to merge. The VMA
- * iterator is assumed to be positioned at the previous VMA, rather than
- * at the gap.
- */
- VMG_FLAG_JUST_EXPAND = 1 << 0,
-};
-
-/* Represents a VMA merge operation. */
+/*
+ * Describes a VMA merge operation and is threaded throughout it.
+ *
+ * Any of the fields may be mutated by the merge operation, so no guarantees are
+ * made to the contents of this structure after a merge operation has completed.
+ */
struct vma_merge_struct {
struct mm_struct *mm;
struct vma_iterator *vmi;
- pgoff_t pgoff;
+ /*
+ * Adjacent VMAs, any of which may be NULL if not present:
+ *
+ * |------|--------|------|
+ * | prev | middle | next |
+ * |------|--------|------|
+ *
+ * middle may not yet exist in the case of a proposed new VMA being
+ * merged, or it may be an existing VMA.
+ *
+ * next may be assigned by the caller.
+ */
struct vm_area_struct *prev;
- struct vm_area_struct *next; /* Modified by vma_merge(). */
- struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */
+ struct vm_area_struct *middle;
+ struct vm_area_struct *next;
+ /* This is the VMA we ultimately target to become the merged VMA. */
+ struct vm_area_struct *target;
+ /*
+ * Initially, the start, end, pgoff fields are provided by the caller
+ * and describe the proposed new VMA range, whether modifying an
+ * existing VMA (which will be 'middle'), or adding a new one.
+ *
+ * During the merge process these fields are updated to describe the new
+ * range _including those VMAs which will be merged_.
+ */
unsigned long start;
unsigned long end;
+ pgoff_t pgoff;
+
unsigned long flags;
struct file *file;
struct anon_vma *anon_vma;
struct mempolicy *policy;
struct vm_userfaultfd_ctx uffd_ctx;
struct anon_vma_name *anon_name;
- enum vma_merge_flags merge_flags;
enum vma_merge_state state;
+
+ /* Flags which callers can use to modify merge behaviour: */
+
+ /*
+ * If we can expand, simply do so. We know there is nothing to merge to
+ * the right. Does not reset state upon failure to merge. The VMA
+ * iterator is assumed to be positioned at the previous VMA, rather than
+ * at the gap.
+ */
+ bool just_expand :1;
+
+ /* Internal flags set during merge process: */
+
+ /*
+ * Internal flag indicating the merge increases vmg->middle->vm_start
+ * (and thereby, vmg->prev->vm_end).
+ */
+ bool __adjust_middle_start :1;
+ /*
+ * Internal flag indicating the merge decreases vmg->next->vm_start
+ * (and thereby, vmg->middle->vm_end).
+ */
+ bool __adjust_next_start :1;
+ /*
+ * Internal flag used during the merge operation to indicate we will
+ * remove vmg->middle.
+ */
+ bool __remove_middle :1;
+ /*
+ * Internal flag used during the merge operationr to indicate we will
+ * remove vmg->next.
+ */
+ bool __remove_next :1;
+
};
static inline bool vmg_nomem(struct vma_merge_struct *vmg)
@@ -110,7 +160,6 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.flags = flags_, \
.pgoff = pgoff_, \
.state = VMA_MERGE_START, \
- .merge_flags = VMG_FLAG_DEFAULT, \
}
#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \
@@ -118,8 +167,8 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.mm = vma_->vm_mm, \
.vmi = vmi_, \
.prev = prev_, \
+ .middle = vma_, \
.next = NULL, \
- .vma = vma_, \
.start = start_, \
.end = end_, \
.flags = vma_->vm_flags, \
@@ -130,7 +179,6 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
.uffd_ctx = vma_->vm_userfaultfd_ctx, \
.anon_name = anon_vma_name(vma_), \
.state = VMA_MERGE_START, \
- .merge_flags = VMG_FLAG_DEFAULT, \
}
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
@@ -157,6 +205,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
if (unlikely(mas_is_err(&vmi->mas)))
return -ENOMEM;
+ vma_mark_attached(vma);
return 0;
}
@@ -169,7 +218,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool unlock);
-void remove_vma(struct vm_area_struct *vma, bool unreachable);
+void remove_vma(struct vm_area_struct *vma);
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);
@@ -364,9 +413,10 @@ static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
}
/* Store a VMA with preallocated memory */
-static inline void vma_iter_store(struct vma_iterator *vmi,
- struct vm_area_struct *vma)
+static inline void vma_iter_store_overwrite(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
{
+ vma_assert_attached(vma);
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
@@ -391,6 +441,13 @@ static inline void vma_iter_store(struct vma_iterator *vmi,
mas_store_prealloc(&vmi->mas, vma);
}
+static inline void vma_iter_store_new(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
+{
+ vma_mark_attached(vma);
+ vma_iter_store_overwrite(vmi, vma);
+}
+
static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
{
return vmi->mas.index;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a6e7acebe9ad..3ed720a787ec 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -586,13 +586,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
mask |= PGTBL_PGD_MODIFIED;
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
- return err;
+ break;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
- return 0;
+ return err;
}
/*
@@ -3771,8 +3771,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
struct vm_struct *area;
void *ret;
kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
- unsigned long real_size = size;
- unsigned long real_align = align;
+ unsigned long original_align = align;
unsigned int shift = PAGE_SHIFT;
if (WARN_ON_ONCE(!size))
@@ -3781,7 +3780,7 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
if ((size >> PAGE_SHIFT) > totalram_pages()) {
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, exceeds total pages",
- real_size);
+ size);
return NULL;
}
@@ -3798,19 +3797,18 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
else
shift = arch_vmap_pte_supported_shift(size);
- align = max(real_align, 1UL << shift);
- size = ALIGN(real_size, 1UL << shift);
+ align = max(original_align, 1UL << shift);
}
again:
- area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
+ area = __get_vm_area_node(size, align, shift, VM_ALLOC |
VM_UNINITIALIZED | vm_flags, start, end, node,
gfp_mask, caller);
if (!area) {
bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, vm_struct allocation failed%s",
- real_size, (nofail) ? ". Retrying." : "");
+ size, (nofail) ? ". Retrying." : "");
if (nofail) {
schedule_timeout_uninterruptible(1);
goto again;
@@ -3860,7 +3858,7 @@ again:
(gfp_mask & __GFP_SKIP_ZERO))
kasan_flags |= KASAN_VMALLOC_INIT;
/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
- area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
+ area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -3869,17 +3867,15 @@ again:
*/
clear_vm_uninitialized_flag(area);
- size = PAGE_ALIGN(size);
if (!(vm_flags & VM_DEFER_KMEMLEAK))
- kmemleak_vmalloc(area, size, gfp_mask);
+ kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp_mask);
return area->addr;
fail:
if (shift > PAGE_SHIFT) {
shift = PAGE_SHIFT;
- align = real_align;
- size = real_size;
+ align = original_align;
goto again;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c767d71c43d7..b620d74b0f66 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -271,6 +271,25 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
}
#endif
+/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
+ * and including the specified highidx
+ * @zone: The current zone in the iterator
+ * @pgdat: The pgdat which node_zones are being iterated
+ * @idx: The index variable
+ * @highidx: The index of the highest zone to return
+ *
+ * This macro iterates through all managed zones up to and including the specified highidx.
+ * The zone iterator enters an invalid state after macro call and must be reinitialized
+ * before it can be used again.
+ */
+#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \
+ for ((idx) = 0, (zone) = (pgdat)->node_zones; \
+ (idx) <= (highidx); \
+ (idx)++, (zone)++) \
+ if (!managed_zone(zone)) \
+ continue; \
+ else
+
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
{
@@ -396,13 +415,9 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
{
unsigned long size = 0;
int zid;
+ struct zone *zone;
- for (zid = 0; zid <= zone_idx; zid++) {
- struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) {
if (!mem_cgroup_disabled())
size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
else
@@ -441,21 +456,26 @@ void drop_slab(void)
} while ((freed >> shift++) > 1);
}
-static int reclaimer_offset(void)
+#define CHECK_RECLAIMER_OFFSET(type) \
+ do { \
+ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
+ PGDEMOTE_##type - PGDEMOTE_KSWAPD); \
+ BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD != \
+ PGSCAN_##type - PGSCAN_KSWAPD); \
+ } while (0)
+
+static int reclaimer_offset(struct scan_control *sc)
{
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
- PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGSCAN_DIRECT - PGSCAN_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
- PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
+ CHECK_RECLAIMER_OFFSET(DIRECT);
+ CHECK_RECLAIMER_OFFSET(KHUGEPAGED);
+ CHECK_RECLAIMER_OFFSET(PROACTIVE);
if (current_is_kswapd())
return 0;
if (current_is_khugepaged())
return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+ if (sc->proactive)
+ return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD;
return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
@@ -495,7 +515,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
{
int reclaimable = 0, write_pending = 0;
int i;
-
+ struct zone *zone;
/*
* If kswapd is disabled, reschedule if necessary but do not
* throttle as the system is likely near OOM.
@@ -508,12 +528,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
* throttle as throttling will occur when the folios cycle
* towards the end of the LRU if still under writeback.
*/
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) {
reclaimable += zone_reclaimable_pages(zone);
write_pending += zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
@@ -769,7 +784,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
__delete_from_swap_cache(folio, swap, shadow);
- mem_cgroup_swapout(folio, swap);
+ memcg1_swapout(folio, swap);
xa_unlock_irq(&mapping->i_pages);
put_swap_folio(folio, swap);
} else {
@@ -1112,6 +1127,13 @@ retry:
if (!folio_trylock(folio))
goto keep;
+ if (folio_contain_hwpoisoned_page(folio)) {
+ unmap_poisoned_folio(folio, folio_pfn(folio), false);
+ folio_unlock(folio);
+ folio_put(folio);
+ continue;
+ }
+
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
nr_pages = folio_nr_pages(folio);
@@ -1279,7 +1301,7 @@ retry:
split_folio_to_list(folio, folio_list))
goto activate_locked;
}
- if (!add_to_swap(folio)) {
+ if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) {
int __maybe_unused order = folio_order(folio);
if (!folio_test_large(folio))
@@ -1295,9 +1317,21 @@ retry:
}
#endif
count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
- if (!add_to_swap(folio))
+ if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
goto activate_locked_split;
}
+ /*
+ * Normally the folio will be dirtied in unmap because its
+ * pte should be dirty. A special case is MADV_FREE page. The
+ * page's pte could have dirty bit cleared but the folio's
+ * SwapBacked flag is still set because clearing the dirty bit
+ * and SwapBacked flag has no lock protected. For such folio,
+ * unmap will not set dirty bit for it, so folio reclaim will
+ * not write the folio out. This can cause data corruption when
+ * the folio is swapped in later. Always setting the dirty flag
+ * for the folio solves the problem.
+ */
+ folio_mark_dirty(folio);
}
}
@@ -1986,7 +2020,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- item = PGSCAN_KSWAPD + reclaimer_offset();
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
@@ -2002,10 +2036,10 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &folio_list);
- __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- item = PGSTEAL_KSWAPD + reclaimer_offset();
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
@@ -2372,17 +2406,13 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
unsigned long total_high_wmark = 0;
unsigned long free, anon;
int z;
+ struct zone *zone;
free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
file = node_page_state(pgdat, NR_ACTIVE_FILE) +
node_page_state(pgdat, NR_INACTIVE_FILE);
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) {
total_high_wmark += high_wmark_pages(zone);
}
@@ -2400,6 +2430,43 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
}
}
+static inline void calculate_pressure_balance(struct scan_control *sc,
+ int swappiness, u64 *fraction, u64 *denominator)
+{
+ unsigned long anon_cost, file_cost, total_cost;
+ unsigned long ap, fp;
+
+ /*
+ * Calculate the pressure balance between anon and file pages.
+ *
+ * The amount of pressure we put on each LRU is inversely
+ * proportional to the cost of reclaiming each list, as
+ * determined by the share of pages that are refaulting, times
+ * the relative IO cost of bringing back a swapped out
+ * anonymous page vs reloading a filesystem page (swappiness).
+ *
+ * Although we limit that influence to ensure no list gets
+ * left behind completely: at least a third of the pressure is
+ * applied, before swappiness.
+ *
+ * With swappiness at 100, anon and file have equal IO cost.
+ */
+ total_cost = sc->anon_cost + sc->file_cost;
+ anon_cost = total_cost + sc->anon_cost;
+ file_cost = total_cost + sc->file_cost;
+ total_cost = anon_cost + file_cost;
+
+ ap = swappiness * (total_cost + 1);
+ ap /= anon_cost + 1;
+
+ fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
+ fp /= file_cost + 1;
+
+ fraction[WORKINGSET_ANON] = ap;
+ fraction[WORKINGSET_FILE] = fp;
+ *denominator = ap + fp;
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
@@ -2412,12 +2479,10 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- unsigned long anon_cost, file_cost, total_cost;
int swappiness = sc_swappiness(sc, memcg);
u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
- unsigned long ap, fp;
enum lru_list lru;
/* If we have no swap space, do not bother scanning anon folios. */
@@ -2466,35 +2531,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
}
scan_balance = SCAN_FRACT;
- /*
- * Calculate the pressure balance between anon and file pages.
- *
- * The amount of pressure we put on each LRU is inversely
- * proportional to the cost of reclaiming each list, as
- * determined by the share of pages that are refaulting, times
- * the relative IO cost of bringing back a swapped out
- * anonymous page vs reloading a filesystem page (swappiness).
- *
- * Although we limit that influence to ensure no list gets
- * left behind completely: at least a third of the pressure is
- * applied, before swappiness.
- *
- * With swappiness at 100, anon and file have equal IO cost.
- */
- total_cost = sc->anon_cost + sc->file_cost;
- anon_cost = total_cost + sc->anon_cost;
- file_cost = total_cost + sc->file_cost;
- total_cost = anon_cost + file_cost;
-
- ap = swappiness * (total_cost + 1);
- ap /= anon_cost + 1;
+ calculate_pressure_balance(sc, swappiness, fraction, &denominator);
- fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
- fp /= file_cost + 1;
-
- fraction[0] = ap;
- fraction[1] = fp;
- denominator = ap + fp;
out:
for_each_evictable_lru(lru) {
bool file = is_file_lru(lru);
@@ -4545,7 +4583,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
break;
}
- item = PGSCAN_KSWAPD + reclaimer_offset();
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc)) {
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
@@ -4695,10 +4733,10 @@ retry:
reset_batch_size(walk);
}
- __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
- item = PGSTEAL_KSWAPD + reclaimer_offset();
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
__count_memcg_events(memcg, item, reclaimed);
@@ -5843,6 +5881,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
int z;
+ struct zone *zone;
/* If not in reclaim/compaction mode, stop */
if (!in_reclaim_compaction(sc))
@@ -5862,17 +5901,16 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
return false;
/* If compaction would go ahead or the allocation would succeed, stop */
- for (z = 0; z <= sc->reclaim_idx; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
+ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
+ unsigned long watermark = min_wmark_pages(zone);
/* Allocation can already succeed, nothing to do */
- if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+ if (zone_watermark_ok(zone, sc->order, watermark,
sc->reclaim_idx, 0))
return false;
- if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
+ if (compaction_suitable(zone, sc->order, watermark,
+ sc->reclaim_idx))
return false;
}
@@ -6099,22 +6137,21 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
sc->reclaim_idx, 0))
return true;
- /* Compaction cannot yet proceed. Do reclaim. */
- if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
- return false;
-
/*
- * Compaction is already possible, but it takes time to run and there
- * are potentially other callers using the pages just freed. So proceed
- * with reclaim to make a buffer of free pages available to give
- * compaction a reasonable chance of completing and allocating the page.
+ * Direct reclaim usually targets the min watermark, but compaction
+ * takes time to run and there are potentially other callers using the
+ * pages just freed. So target a higher buffer to give compaction a
+ * reasonable chance of completing and allocating the pages.
+ *
* Note that we won't actually reclaim the whole buffer in one attempt
* as the target watermark in should_continue_reclaim() is lower. But if
* we are already above the high+gap watermark, don't reclaim at all.
*/
- watermark = high_wmark_pages(zone) + compact_gap(sc->order);
+ watermark = high_wmark_pages(zone);
+ if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx))
+ return true;
- return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
+ return false;
}
static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
@@ -6393,11 +6430,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
- for (i = 0; i <= ZONE_NORMAL; i++) {
- zone = &pgdat->node_zones[i];
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
if (!zone_reclaimable_pages(zone))
continue;
@@ -6702,17 +6735,25 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
* Check watermarks bottom-up as lower zones are more likely to
* meet watermarks.
*/
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
+ unsigned long free_pages;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
mark = promo_wmark_pages(zone);
else
mark = high_wmark_pages(zone);
- if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
+
+ /*
+ * In defrag_mode, watermarks must be met in whole
+ * blocks to avoid polluting allocator fallbacks.
+ */
+ if (defrag_mode)
+ free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);
+ else
+ free_pages = zone_page_state(zone, NR_FREE_PAGES);
+
+ if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
+ 0, free_pages))
return true;
}
@@ -6792,11 +6833,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
/* Reclaim a number of pages proportional to the number of zones */
sc->nr_to_reclaim = 0;
- for (z = 0; z <= sc->reclaim_idx; z++) {
- zone = pgdat->node_zones + z;
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
}
@@ -6827,12 +6864,7 @@ update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
int i;
struct zone *zone;
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
if (active)
set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
else
@@ -6893,11 +6925,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
* stall or direct reclaim until kswapd is finished.
*/
nr_boost_reclaim = 0;
- for (i = 0; i <= highest_zoneidx; i++) {
- zone = pgdat->node_zones + i;
- if (!managed_zone(zone))
- continue;
-
+ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
nr_boost_reclaim += zone->watermark_boost;
zone_boosts[i] = zone->watermark_boost;
}
@@ -7404,6 +7432,28 @@ void __meminit kswapd_stop(int nid)
pgdat_kswapd_unlock(pgdat);
}
+static const struct ctl_table vmscan_sysctl_table[] = {
+ {
+ .procname = "swappiness",
+ .data = &vm_swappiness,
+ .maxlen = sizeof(vm_swappiness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO_HUNDRED,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "zone_reclaim_mode",
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ }
+#endif
+};
+
static int __init kswapd_init(void)
{
int nid;
@@ -7411,6 +7461,7 @@ static int __init kswapd_init(void)
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
+ register_sysctl_init("vm", vmscan_sysctl_table);
return 0;
}
@@ -7576,11 +7627,11 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
return NODE_RECLAIM_NOSCAN;
- if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+ if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
return NODE_RECLAIM_NOSCAN;
ret = __node_reclaim(pgdat, gfp_mask, order);
- clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+ clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 16bfe1c694dd..4c268ce39ff2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -31,8 +31,10 @@
#include "internal.h"
+#ifdef CONFIG_PROC_FS
#ifdef CONFIG_NUMA
-int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
+#define ENABLE_NUMA_STAT 1
+static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
@@ -74,7 +76,7 @@ static void invalid_numa_statistics(void)
static DEFINE_MUTEX(vm_numa_stat_lock);
-int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
+static int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int ret, oldval;
@@ -102,6 +104,7 @@ out:
return ret;
}
#endif
+#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -1190,6 +1193,7 @@ int fragmentation_index(struct zone *zone, unsigned int order)
const char * const vmstat_text[] = {
/* enum zone_stat_item counters */
"nr_free_pages",
+ "nr_free_pages_blocks",
"nr_zone_inactive_anon",
"nr_zone_active_anon",
"nr_zone_inactive_file",
@@ -1273,9 +1277,11 @@ const char * const vmstat_text[] = {
"pgdemote_kswapd",
"pgdemote_direct",
"pgdemote_khugepaged",
+ "pgdemote_proactive",
#ifdef CONFIG_HUGETLB_PAGE
"nr_hugetlb",
#endif
+ "nr_balloon_pages",
/* system-wide enum vm_stat_item counters */
"nr_dirty_threshold",
"nr_dirty_background_threshold",
@@ -1307,9 +1313,11 @@ const char * const vmstat_text[] = {
"pgsteal_kswapd",
"pgsteal_direct",
"pgsteal_khugepaged",
+ "pgsteal_proactive",
"pgscan_kswapd",
"pgscan_direct",
"pgscan_khugepaged",
+ "pgscan_proactive",
"pgscan_direct_throttle",
"pgscan_anon",
"pgscan_file",
@@ -1435,6 +1443,8 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_X86
"direct_map_level2_splits",
"direct_map_level3_splits",
+ "direct_map_level2_collapses",
+ "direct_map_level3_collapses",
#endif
#ifdef CONFIG_PER_VMA_LOCK_STATS
"vma_lock_success",
@@ -1938,7 +1948,7 @@ static const struct seq_operations vmstat_op = {
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
-int sysctl_stat_interval __read_mostly = HZ;
+static int sysctl_stat_interval __read_mostly = HZ;
static int vmstat_late_init_done;
#ifdef CONFIG_PROC_FS
@@ -1947,7 +1957,7 @@ static void refresh_vm_stats(struct work_struct *work)
refresh_cpu_vm_stats(true);
}
-int vmstat_refresh(const struct ctl_table *table, int write,
+static int vmstat_refresh(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
long val;
@@ -2196,6 +2206,38 @@ static int __init vmstat_late_init(void)
late_initcall(vmstat_late_init);
#endif
+#ifdef CONFIG_PROC_FS
+static const struct ctl_table vmstat_table[] = {
+#ifdef CONFIG_SMP
+ {
+ .procname = "stat_interval",
+ .data = &sysctl_stat_interval,
+ .maxlen = sizeof(sysctl_stat_interval),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "stat_refresh",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0600,
+ .proc_handler = vmstat_refresh,
+ },
+#endif
+#ifdef CONFIG_NUMA
+ {
+ .procname = "numa_stat",
+ .data = &sysctl_vm_numa_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_vm_numa_stat_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+};
+#endif
+
struct workqueue_struct *mm_percpu_wq;
void __init init_mm_internals(void)
@@ -2227,6 +2269,7 @@ void __init init_mm_internals(void)
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
+ register_sysctl_init("vm", vmstat_table);
#endif
}
diff --git a/mm/z3fold.c b/mm/z3fold.c
deleted file mode 100644
index 379d24b4fef9..000000000000
--- a/mm/z3fold.c
+++ /dev/null
@@ -1,1447 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * z3fold.c
- *
- * Author: Vitaly Wool <vitaly.wool@konsulko.com>
- * Copyright (C) 2016, Sony Mobile Communications Inc.
- *
- * This implementation is based on zbud written by Seth Jennings.
- *
- * z3fold is an special purpose allocator for storing compressed pages. It
- * can store up to three compressed pages per page which improves the
- * compression ratio of zbud while retaining its main concepts (e. g. always
- * storing an integral number of objects per page) and simplicity.
- * It still has simple and deterministic reclaim properties that make it
- * preferable to a higher density approach (with no requirement on integral
- * number of object per page) when reclaim is used.
- *
- * As in zbud, pages are divided into "chunks". The size of the chunks is
- * fixed at compile time and is determined by NCHUNKS_ORDER below.
- *
- * z3fold doesn't export any API and is meant to be used via zpool API.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/atomic.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/page-flags.h>
-#include <linux/migrate.h>
-#include <linux/node.h>
-#include <linux/compaction.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/zpool.h>
-#include <linux/kmemleak.h>
-
-/*
- * NCHUNKS_ORDER determines the internal allocation granularity, effectively
- * adjusting internal fragmentation. It also determines the number of
- * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
- * in the beginning of an allocated page are occupied by z3fold header, so
- * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
- * which shows the max number of free chunks in z3fold page, also there will
- * be 63, or 62, respectively, freelists per pool.
- */
-#define NCHUNKS_ORDER 6
-
-#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
-#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
-#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
-#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
-#define NCHUNKS (TOTAL_CHUNKS - ZHDR_CHUNKS)
-
-#define BUDDY_MASK (0x3)
-#define BUDDY_SHIFT 2
-#define SLOTS_ALIGN (0x40)
-
-/*****************
- * Structures
-*****************/
-struct z3fold_pool;
-
-enum buddy {
- HEADLESS = 0,
- FIRST,
- MIDDLE,
- LAST,
- BUDDIES_MAX = LAST
-};
-
-struct z3fold_buddy_slots {
- /*
- * we are using BUDDY_MASK in handle_to_buddy etc. so there should
- * be enough slots to hold all possible variants
- */
- unsigned long slot[BUDDY_MASK + 1];
- unsigned long pool; /* back link */
- rwlock_t lock;
-};
-#define HANDLE_FLAG_MASK (0x03)
-
-/*
- * struct z3fold_header - z3fold page metadata occupying first chunks of each
- * z3fold page, except for HEADLESS pages
- * @buddy: links the z3fold page into the relevant list in the
- * pool
- * @page_lock: per-page lock
- * @refcount: reference count for the z3fold page
- * @work: work_struct for page layout optimization
- * @slots: pointer to the structure holding buddy slots
- * @pool: pointer to the containing pool
- * @cpu: CPU which this page "belongs" to
- * @first_chunks: the size of the first buddy in chunks, 0 if free
- * @middle_chunks: the size of the middle buddy in chunks, 0 if free
- * @last_chunks: the size of the last buddy in chunks, 0 if free
- * @first_num: the starting number (for the first handle)
- * @mapped_count: the number of objects currently mapped
- */
-struct z3fold_header {
- struct list_head buddy;
- spinlock_t page_lock;
- struct kref refcount;
- struct work_struct work;
- struct z3fold_buddy_slots *slots;
- struct z3fold_pool *pool;
- short cpu;
- unsigned short first_chunks;
- unsigned short middle_chunks;
- unsigned short last_chunks;
- unsigned short start_middle;
- unsigned short first_num:2;
- unsigned short mapped_count:2;
- unsigned short foreign_handles:2;
-};
-
-/**
- * struct z3fold_pool - stores metadata for each z3fold pool
- * @name: pool name
- * @lock: protects pool unbuddied lists
- * @stale_lock: protects pool stale page list
- * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2-
- * buddies; the list each z3fold page is added to depends on
- * the size of its free region.
- * @stale: list of pages marked for freeing
- * @pages_nr: number of z3fold pages in the pool.
- * @c_handle: cache for z3fold_buddy_slots allocation
- * @compact_wq: workqueue for page layout background optimization
- * @release_wq: workqueue for safe page release
- * @work: work_struct for safe page release
- *
- * This structure is allocated at pool creation time and maintains metadata
- * pertaining to a particular z3fold pool.
- */
-struct z3fold_pool {
- const char *name;
- spinlock_t lock;
- spinlock_t stale_lock;
- struct list_head __percpu *unbuddied;
- struct list_head stale;
- atomic64_t pages_nr;
- struct kmem_cache *c_handle;
- struct workqueue_struct *compact_wq;
- struct workqueue_struct *release_wq;
- struct work_struct work;
-};
-
-/*
- * Internal z3fold page flags
- */
-enum z3fold_page_flags {
- PAGE_HEADLESS = 0,
- MIDDLE_CHUNK_MAPPED,
- NEEDS_COMPACTING,
- PAGE_STALE,
- PAGE_CLAIMED, /* by either reclaim or free */
- PAGE_MIGRATED, /* page is migrated and soon to be released */
-};
-
-/*
- * handle flags, go under HANDLE_FLAG_MASK
- */
-enum z3fold_handle_flags {
- HANDLES_NOFREE = 0,
-};
-
-/*
- * Forward declarations
- */
-static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
-static void compact_page_work(struct work_struct *w);
-
-/*****************
- * Helpers
-*****************/
-
-/* Converts an allocation size in bytes to size in z3fold chunks */
-static int size_to_chunks(size_t size)
-{
- return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
-}
-
-#define for_each_unbuddied_list(_iter, _begin) \
- for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
-
-static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
- gfp_t gfp)
-{
- struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle,
- gfp);
-
- if (slots) {
- /* It will be freed separately in free_handle(). */
- kmemleak_not_leak(slots);
- slots->pool = (unsigned long)pool;
- rwlock_init(&slots->lock);
- }
-
- return slots;
-}
-
-static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
-{
- return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
-}
-
-static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
-{
- return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
-}
-
-/* Lock a z3fold page */
-static inline void z3fold_page_lock(struct z3fold_header *zhdr)
-{
- spin_lock(&zhdr->page_lock);
-}
-
-/* Try to lock a z3fold page */
-static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
-{
- return spin_trylock(&zhdr->page_lock);
-}
-
-/* Unlock a z3fold page */
-static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
-{
- spin_unlock(&zhdr->page_lock);
-}
-
-/* return locked z3fold page if it's not headless */
-static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
-{
- struct z3fold_buddy_slots *slots;
- struct z3fold_header *zhdr;
- int locked = 0;
-
- if (!(handle & (1 << PAGE_HEADLESS))) {
- slots = handle_to_slots(handle);
- do {
- unsigned long addr;
-
- read_lock(&slots->lock);
- addr = *(unsigned long *)handle;
- zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
- locked = z3fold_page_trylock(zhdr);
- read_unlock(&slots->lock);
- if (locked) {
- struct page *page = virt_to_page(zhdr);
-
- if (!test_bit(PAGE_MIGRATED, &page->private))
- break;
- z3fold_page_unlock(zhdr);
- }
- cpu_relax();
- } while (true);
- } else {
- zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
- }
-
- return zhdr;
-}
-
-static inline void put_z3fold_header(struct z3fold_header *zhdr)
-{
- struct page *page = virt_to_page(zhdr);
-
- if (!test_bit(PAGE_HEADLESS, &page->private))
- z3fold_page_unlock(zhdr);
-}
-
-static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
-{
- struct z3fold_buddy_slots *slots;
- int i;
- bool is_free;
-
- if (WARN_ON(*(unsigned long *)handle == 0))
- return;
-
- slots = handle_to_slots(handle);
- write_lock(&slots->lock);
- *(unsigned long *)handle = 0;
-
- if (test_bit(HANDLES_NOFREE, &slots->pool)) {
- write_unlock(&slots->lock);
- return; /* simple case, nothing else to do */
- }
-
- if (zhdr->slots != slots)
- zhdr->foreign_handles--;
-
- is_free = true;
- for (i = 0; i <= BUDDY_MASK; i++) {
- if (slots->slot[i]) {
- is_free = false;
- break;
- }
- }
- write_unlock(&slots->lock);
-
- if (is_free) {
- struct z3fold_pool *pool = slots_to_pool(slots);
-
- if (zhdr->slots == slots)
- zhdr->slots = NULL;
- kmem_cache_free(pool->c_handle, slots);
- }
-}
-
-/* Initializes the z3fold header of a newly allocated z3fold page */
-static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
- struct z3fold_pool *pool, gfp_t gfp)
-{
- struct z3fold_header *zhdr = page_address(page);
- struct z3fold_buddy_slots *slots;
-
- clear_bit(PAGE_HEADLESS, &page->private);
- clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
- clear_bit(NEEDS_COMPACTING, &page->private);
- clear_bit(PAGE_STALE, &page->private);
- clear_bit(PAGE_CLAIMED, &page->private);
- clear_bit(PAGE_MIGRATED, &page->private);
- if (headless)
- return zhdr;
-
- slots = alloc_slots(pool, gfp);
- if (!slots)
- return NULL;
-
- memset(zhdr, 0, sizeof(*zhdr));
- spin_lock_init(&zhdr->page_lock);
- kref_init(&zhdr->refcount);
- zhdr->cpu = -1;
- zhdr->slots = slots;
- zhdr->pool = pool;
- INIT_LIST_HEAD(&zhdr->buddy);
- INIT_WORK(&zhdr->work, compact_page_work);
- return zhdr;
-}
-
-/* Resets the struct page fields and frees the page */
-static void free_z3fold_page(struct page *page, bool headless)
-{
- if (!headless) {
- lock_page(page);
- __ClearPageMovable(page);
- unlock_page(page);
- }
- __free_page(page);
-}
-
-/* Helper function to build the index */
-static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
-{
- return (bud + zhdr->first_num) & BUDDY_MASK;
-}
-
-/*
- * Encodes the handle of a particular buddy within a z3fold page.
- * Zhdr->page_lock should be held as this function accesses first_num
- * if bud != HEADLESS.
- */
-static unsigned long __encode_handle(struct z3fold_header *zhdr,
- struct z3fold_buddy_slots *slots,
- enum buddy bud)
-{
- unsigned long h = (unsigned long)zhdr;
- int idx = 0;
-
- /*
- * For a headless page, its handle is its pointer with the extra
- * PAGE_HEADLESS bit set
- */
- if (bud == HEADLESS)
- return h | (1 << PAGE_HEADLESS);
-
- /* otherwise, return pointer to encoded handle */
- idx = __idx(zhdr, bud);
- h += idx;
- if (bud == LAST)
- h |= (zhdr->last_chunks << BUDDY_SHIFT);
-
- write_lock(&slots->lock);
- slots->slot[idx] = h;
- write_unlock(&slots->lock);
- return (unsigned long)&slots->slot[idx];
-}
-
-static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
-{
- return __encode_handle(zhdr, zhdr->slots, bud);
-}
-
-/* only for LAST bud, returns zero otherwise */
-static unsigned short handle_to_chunks(unsigned long handle)
-{
- struct z3fold_buddy_slots *slots = handle_to_slots(handle);
- unsigned long addr;
-
- read_lock(&slots->lock);
- addr = *(unsigned long *)handle;
- read_unlock(&slots->lock);
- return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
-}
-
-/*
- * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
- * but that doesn't matter. because the masking will result in the
- * correct buddy number.
- */
-static enum buddy handle_to_buddy(unsigned long handle)
-{
- struct z3fold_header *zhdr;
- struct z3fold_buddy_slots *slots = handle_to_slots(handle);
- unsigned long addr;
-
- read_lock(&slots->lock);
- WARN_ON(handle & (1 << PAGE_HEADLESS));
- addr = *(unsigned long *)handle;
- read_unlock(&slots->lock);
- zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
- return (addr - zhdr->first_num) & BUDDY_MASK;
-}
-
-static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
-{
- return zhdr->pool;
-}
-
-static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
-{
- struct page *page = virt_to_page(zhdr);
- struct z3fold_pool *pool = zhdr_to_pool(zhdr);
-
- WARN_ON(!list_empty(&zhdr->buddy));
- set_bit(PAGE_STALE, &page->private);
- clear_bit(NEEDS_COMPACTING, &page->private);
- spin_lock(&pool->lock);
- spin_unlock(&pool->lock);
-
- if (locked)
- z3fold_page_unlock(zhdr);
-
- spin_lock(&pool->stale_lock);
- list_add(&zhdr->buddy, &pool->stale);
- queue_work(pool->release_wq, &pool->work);
- spin_unlock(&pool->stale_lock);
-
- atomic64_dec(&pool->pages_nr);
-}
-
-static void release_z3fold_page_locked(struct kref *ref)
-{
- struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
- refcount);
- WARN_ON(z3fold_page_trylock(zhdr));
- __release_z3fold_page(zhdr, true);
-}
-
-static void release_z3fold_page_locked_list(struct kref *ref)
-{
- struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
- refcount);
- struct z3fold_pool *pool = zhdr_to_pool(zhdr);
-
- spin_lock(&pool->lock);
- list_del_init(&zhdr->buddy);
- spin_unlock(&pool->lock);
-
- WARN_ON(z3fold_page_trylock(zhdr));
- __release_z3fold_page(zhdr, true);
-}
-
-static inline int put_z3fold_locked(struct z3fold_header *zhdr)
-{
- return kref_put(&zhdr->refcount, release_z3fold_page_locked);
-}
-
-static inline int put_z3fold_locked_list(struct z3fold_header *zhdr)
-{
- return kref_put(&zhdr->refcount, release_z3fold_page_locked_list);
-}
-
-static void free_pages_work(struct work_struct *w)
-{
- struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
-
- spin_lock(&pool->stale_lock);
- while (!list_empty(&pool->stale)) {
- struct z3fold_header *zhdr = list_first_entry(&pool->stale,
- struct z3fold_header, buddy);
- struct page *page = virt_to_page(zhdr);
-
- list_del(&zhdr->buddy);
- if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
- continue;
- spin_unlock(&pool->stale_lock);
- cancel_work_sync(&zhdr->work);
- free_z3fold_page(page, false);
- cond_resched();
- spin_lock(&pool->stale_lock);
- }
- spin_unlock(&pool->stale_lock);
-}
-
-/*
- * Returns the number of free chunks in a z3fold page.
- * NB: can't be used with HEADLESS pages.
- */
-static int num_free_chunks(struct z3fold_header *zhdr)
-{
- int nfree;
- /*
- * If there is a middle object, pick up the bigger free space
- * either before or after it. Otherwise just subtract the number
- * of chunks occupied by the first and the last objects.
- */
- if (zhdr->middle_chunks != 0) {
- int nfree_before = zhdr->first_chunks ?
- 0 : zhdr->start_middle - ZHDR_CHUNKS;
- int nfree_after = zhdr->last_chunks ?
- 0 : TOTAL_CHUNKS -
- (zhdr->start_middle + zhdr->middle_chunks);
- nfree = max(nfree_before, nfree_after);
- } else
- nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
- return nfree;
-}
-
-/* Add to the appropriate unbuddied list */
-static inline void add_to_unbuddied(struct z3fold_pool *pool,
- struct z3fold_header *zhdr)
-{
- if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
- zhdr->middle_chunks == 0) {
- struct list_head *unbuddied;
- int freechunks = num_free_chunks(zhdr);
-
- migrate_disable();
- unbuddied = this_cpu_ptr(pool->unbuddied);
- spin_lock(&pool->lock);
- list_add(&zhdr->buddy, &unbuddied[freechunks]);
- spin_unlock(&pool->lock);
- zhdr->cpu = smp_processor_id();
- migrate_enable();
- }
-}
-
-static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
-{
- enum buddy bud = HEADLESS;
-
- if (zhdr->middle_chunks) {
- if (!zhdr->first_chunks &&
- chunks <= zhdr->start_middle - ZHDR_CHUNKS)
- bud = FIRST;
- else if (!zhdr->last_chunks)
- bud = LAST;
- } else {
- if (!zhdr->first_chunks)
- bud = FIRST;
- else if (!zhdr->last_chunks)
- bud = LAST;
- else
- bud = MIDDLE;
- }
-
- return bud;
-}
-
-static inline void *mchunk_memmove(struct z3fold_header *zhdr,
- unsigned short dst_chunk)
-{
- void *beg = zhdr;
- return memmove(beg + (dst_chunk << CHUNK_SHIFT),
- beg + (zhdr->start_middle << CHUNK_SHIFT),
- zhdr->middle_chunks << CHUNK_SHIFT);
-}
-
-static inline bool buddy_single(struct z3fold_header *zhdr)
-{
- return !((zhdr->first_chunks && zhdr->middle_chunks) ||
- (zhdr->first_chunks && zhdr->last_chunks) ||
- (zhdr->middle_chunks && zhdr->last_chunks));
-}
-
-static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
-{
- struct z3fold_pool *pool = zhdr_to_pool(zhdr);
- void *p = zhdr;
- unsigned long old_handle = 0;
- size_t sz = 0;
- struct z3fold_header *new_zhdr = NULL;
- int first_idx = __idx(zhdr, FIRST);
- int middle_idx = __idx(zhdr, MIDDLE);
- int last_idx = __idx(zhdr, LAST);
- unsigned short *moved_chunks = NULL;
-
- /*
- * No need to protect slots here -- all the slots are "local" and
- * the page lock is already taken
- */
- if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
- p += ZHDR_SIZE_ALIGNED;
- sz = zhdr->first_chunks << CHUNK_SHIFT;
- old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
- moved_chunks = &zhdr->first_chunks;
- } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
- p += zhdr->start_middle << CHUNK_SHIFT;
- sz = zhdr->middle_chunks << CHUNK_SHIFT;
- old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
- moved_chunks = &zhdr->middle_chunks;
- } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
- p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
- sz = zhdr->last_chunks << CHUNK_SHIFT;
- old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
- moved_chunks = &zhdr->last_chunks;
- }
-
- if (sz > 0) {
- enum buddy new_bud = HEADLESS;
- short chunks = size_to_chunks(sz);
- void *q;
-
- new_zhdr = __z3fold_alloc(pool, sz, false);
- if (!new_zhdr)
- return NULL;
-
- if (WARN_ON(new_zhdr == zhdr))
- goto out_fail;
-
- new_bud = get_free_buddy(new_zhdr, chunks);
- q = new_zhdr;
- switch (new_bud) {
- case FIRST:
- new_zhdr->first_chunks = chunks;
- q += ZHDR_SIZE_ALIGNED;
- break;
- case MIDDLE:
- new_zhdr->middle_chunks = chunks;
- new_zhdr->start_middle =
- new_zhdr->first_chunks + ZHDR_CHUNKS;
- q += new_zhdr->start_middle << CHUNK_SHIFT;
- break;
- case LAST:
- new_zhdr->last_chunks = chunks;
- q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
- break;
- default:
- goto out_fail;
- }
- new_zhdr->foreign_handles++;
- memcpy(q, p, sz);
- write_lock(&zhdr->slots->lock);
- *(unsigned long *)old_handle = (unsigned long)new_zhdr +
- __idx(new_zhdr, new_bud);
- if (new_bud == LAST)
- *(unsigned long *)old_handle |=
- (new_zhdr->last_chunks << BUDDY_SHIFT);
- write_unlock(&zhdr->slots->lock);
- add_to_unbuddied(pool, new_zhdr);
- z3fold_page_unlock(new_zhdr);
-
- *moved_chunks = 0;
- }
-
- return new_zhdr;
-
-out_fail:
- if (new_zhdr && !put_z3fold_locked(new_zhdr)) {
- add_to_unbuddied(pool, new_zhdr);
- z3fold_page_unlock(new_zhdr);
- }
- return NULL;
-
-}
-
-#define BIG_CHUNK_GAP 3
-/* Has to be called with lock held */
-static int z3fold_compact_page(struct z3fold_header *zhdr)
-{
- struct page *page = virt_to_page(zhdr);
-
- if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
- return 0; /* can't move middle chunk, it's used */
-
- if (unlikely(PageIsolated(page)))
- return 0;
-
- if (zhdr->middle_chunks == 0)
- return 0; /* nothing to compact */
-
- if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
- /* move to the beginning */
- mchunk_memmove(zhdr, ZHDR_CHUNKS);
- zhdr->first_chunks = zhdr->middle_chunks;
- zhdr->middle_chunks = 0;
- zhdr->start_middle = 0;
- zhdr->first_num++;
- return 1;
- }
-
- /*
- * moving data is expensive, so let's only do that if
- * there's substantial gain (at least BIG_CHUNK_GAP chunks)
- */
- if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
- zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
- BIG_CHUNK_GAP) {
- mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
- zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
- return 1;
- } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
- TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
- + zhdr->middle_chunks) >=
- BIG_CHUNK_GAP) {
- unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
- zhdr->middle_chunks;
- mchunk_memmove(zhdr, new_start);
- zhdr->start_middle = new_start;
- return 1;
- }
-
- return 0;
-}
-
-static void do_compact_page(struct z3fold_header *zhdr, bool locked)
-{
- struct z3fold_pool *pool = zhdr_to_pool(zhdr);
- struct page *page;
-
- page = virt_to_page(zhdr);
- if (locked)
- WARN_ON(z3fold_page_trylock(zhdr));
- else
- z3fold_page_lock(zhdr);
- if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
- z3fold_page_unlock(zhdr);
- return;
- }
- spin_lock(&pool->lock);
- list_del_init(&zhdr->buddy);
- spin_unlock(&pool->lock);
-
- if (put_z3fold_locked(zhdr))
- return;
-
- if (test_bit(PAGE_STALE, &page->private) ||
- test_and_set_bit(PAGE_CLAIMED, &page->private)) {
- z3fold_page_unlock(zhdr);
- return;
- }
-
- if (!zhdr->foreign_handles && buddy_single(zhdr) &&
- zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
- if (!put_z3fold_locked(zhdr)) {
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
- }
- return;
- }
-
- z3fold_compact_page(zhdr);
- add_to_unbuddied(pool, zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
-}
-
-static void compact_page_work(struct work_struct *w)
-{
- struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
- work);
-
- do_compact_page(zhdr, false);
-}
-
-/* returns _locked_ z3fold page header or NULL */
-static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
- size_t size, bool can_sleep)
-{
- struct z3fold_header *zhdr = NULL;
- struct page *page;
- struct list_head *unbuddied;
- int chunks = size_to_chunks(size), i;
-
-lookup:
- migrate_disable();
- /* First, try to find an unbuddied z3fold page. */
- unbuddied = this_cpu_ptr(pool->unbuddied);
- for_each_unbuddied_list(i, chunks) {
- struct list_head *l = &unbuddied[i];
-
- zhdr = list_first_entry_or_null(READ_ONCE(l),
- struct z3fold_header, buddy);
-
- if (!zhdr)
- continue;
-
- /* Re-check under lock. */
- spin_lock(&pool->lock);
- if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
- struct z3fold_header, buddy)) ||
- !z3fold_page_trylock(zhdr)) {
- spin_unlock(&pool->lock);
- zhdr = NULL;
- migrate_enable();
- if (can_sleep)
- cond_resched();
- goto lookup;
- }
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- spin_unlock(&pool->lock);
-
- page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private) ||
- test_bit(PAGE_CLAIMED, &page->private)) {
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- migrate_enable();
- if (can_sleep)
- cond_resched();
- goto lookup;
- }
-
- /*
- * this page could not be removed from its unbuddied
- * list while pool lock was held, and then we've taken
- * page lock so kref_put could not be called before
- * we got here, so it's safe to just call kref_get()
- */
- kref_get(&zhdr->refcount);
- break;
- }
- migrate_enable();
-
- if (!zhdr) {
- int cpu;
-
- /* look for _exact_ match on other cpus' lists */
- for_each_online_cpu(cpu) {
- struct list_head *l;
-
- unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
- spin_lock(&pool->lock);
- l = &unbuddied[chunks];
-
- zhdr = list_first_entry_or_null(READ_ONCE(l),
- struct z3fold_header, buddy);
-
- if (!zhdr || !z3fold_page_trylock(zhdr)) {
- spin_unlock(&pool->lock);
- zhdr = NULL;
- continue;
- }
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- spin_unlock(&pool->lock);
-
- page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private) ||
- test_bit(PAGE_CLAIMED, &page->private)) {
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- if (can_sleep)
- cond_resched();
- continue;
- }
- kref_get(&zhdr->refcount);
- break;
- }
- }
-
- if (zhdr && !zhdr->slots) {
- zhdr->slots = alloc_slots(pool, GFP_ATOMIC);
- if (!zhdr->slots)
- goto out_fail;
- }
- return zhdr;
-
-out_fail:
- if (!put_z3fold_locked(zhdr)) {
- add_to_unbuddied(pool, zhdr);
- z3fold_page_unlock(zhdr);
- }
- return NULL;
-}
-
-/*
- * API Functions
- */
-
-/**
- * z3fold_create_pool() - create a new z3fold pool
- * @name: pool name
- * @gfp: gfp flags when allocating the z3fold pool structure
- *
- * Return: pointer to the new z3fold pool or NULL if the metadata allocation
- * failed.
- */
-static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp)
-{
- struct z3fold_pool *pool = NULL;
- int i, cpu;
-
- pool = kzalloc(sizeof(struct z3fold_pool), gfp);
- if (!pool)
- goto out;
- pool->c_handle = kmem_cache_create("z3fold_handle",
- sizeof(struct z3fold_buddy_slots),
- SLOTS_ALIGN, 0, NULL);
- if (!pool->c_handle)
- goto out_c;
- spin_lock_init(&pool->lock);
- spin_lock_init(&pool->stale_lock);
- pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS,
- __alignof__(struct list_head));
- if (!pool->unbuddied)
- goto out_pool;
- for_each_possible_cpu(cpu) {
- struct list_head *unbuddied =
- per_cpu_ptr(pool->unbuddied, cpu);
- for_each_unbuddied_list(i, 0)
- INIT_LIST_HEAD(&unbuddied[i]);
- }
- INIT_LIST_HEAD(&pool->stale);
- atomic64_set(&pool->pages_nr, 0);
- pool->name = name;
- pool->compact_wq = create_singlethread_workqueue(pool->name);
- if (!pool->compact_wq)
- goto out_unbuddied;
- pool->release_wq = create_singlethread_workqueue(pool->name);
- if (!pool->release_wq)
- goto out_wq;
- INIT_WORK(&pool->work, free_pages_work);
- return pool;
-
-out_wq:
- destroy_workqueue(pool->compact_wq);
-out_unbuddied:
- free_percpu(pool->unbuddied);
-out_pool:
- kmem_cache_destroy(pool->c_handle);
-out_c:
- kfree(pool);
-out:
- return NULL;
-}
-
-/**
- * z3fold_destroy_pool() - destroys an existing z3fold pool
- * @pool: the z3fold pool to be destroyed
- *
- * The pool should be emptied before this function is called.
- */
-static void z3fold_destroy_pool(struct z3fold_pool *pool)
-{
- kmem_cache_destroy(pool->c_handle);
-
- /*
- * We need to destroy pool->compact_wq before pool->release_wq,
- * as any pending work on pool->compact_wq will call
- * queue_work(pool->release_wq, &pool->work).
- *
- * There are still outstanding pages until both workqueues are drained,
- * so we cannot unregister migration until then.
- */
-
- destroy_workqueue(pool->compact_wq);
- destroy_workqueue(pool->release_wq);
- free_percpu(pool->unbuddied);
- kfree(pool);
-}
-
-static const struct movable_operations z3fold_mops;
-
-/**
- * z3fold_alloc() - allocates a region of a given size
- * @pool: z3fold pool from which to allocate
- * @size: size in bytes of the desired allocation
- * @gfp: gfp flags used if the pool needs to grow
- * @handle: handle of the new allocation
- *
- * This function will attempt to find a free region in the pool large enough to
- * satisfy the allocation request. A search of the unbuddied lists is
- * performed first. If no suitable free region is found, then a new page is
- * allocated and added to the pool to satisfy the request.
- *
- * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
- * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
- * a new page.
- */
-static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- int chunks = size_to_chunks(size);
- struct z3fold_header *zhdr = NULL;
- struct page *page = NULL;
- enum buddy bud;
- bool can_sleep = gfpflags_allow_blocking(gfp);
-
- if (!size || (gfp & __GFP_HIGHMEM))
- return -EINVAL;
-
- if (size > PAGE_SIZE)
- return -ENOSPC;
-
- if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
- bud = HEADLESS;
- else {
-retry:
- zhdr = __z3fold_alloc(pool, size, can_sleep);
- if (zhdr) {
- bud = get_free_buddy(zhdr, chunks);
- if (bud == HEADLESS) {
- if (!put_z3fold_locked(zhdr))
- z3fold_page_unlock(zhdr);
- pr_err("No free chunks in unbuddied\n");
- WARN_ON(1);
- goto retry;
- }
- page = virt_to_page(zhdr);
- goto found;
- }
- bud = FIRST;
- }
-
- page = alloc_page(gfp);
- if (!page)
- return -ENOMEM;
-
- zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
- if (!zhdr) {
- __free_page(page);
- return -ENOMEM;
- }
- atomic64_inc(&pool->pages_nr);
-
- if (bud == HEADLESS) {
- set_bit(PAGE_HEADLESS, &page->private);
- goto headless;
- }
- if (can_sleep) {
- lock_page(page);
- __SetPageMovable(page, &z3fold_mops);
- unlock_page(page);
- } else {
- WARN_ON(!trylock_page(page));
- __SetPageMovable(page, &z3fold_mops);
- unlock_page(page);
- }
- z3fold_page_lock(zhdr);
-
-found:
- if (bud == FIRST)
- zhdr->first_chunks = chunks;
- else if (bud == LAST)
- zhdr->last_chunks = chunks;
- else {
- zhdr->middle_chunks = chunks;
- zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
- }
- add_to_unbuddied(pool, zhdr);
-
-headless:
- spin_lock(&pool->lock);
- *handle = encode_handle(zhdr, bud);
- spin_unlock(&pool->lock);
- if (bud != HEADLESS)
- z3fold_page_unlock(zhdr);
-
- return 0;
-}
-
-/**
- * z3fold_free() - frees the allocation associated with the given handle
- * @pool: pool in which the allocation resided
- * @handle: handle associated with the allocation returned by z3fold_alloc()
- *
- * In the case that the z3fold page in which the allocation resides is under
- * reclaim, as indicated by the PAGE_CLAIMED flag being set, this function
- * only sets the first|middle|last_chunks to 0. The page is actually freed
- * once all buddies are evicted (see z3fold_reclaim_page() below).
- */
-static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
-{
- struct z3fold_header *zhdr;
- struct page *page;
- enum buddy bud;
- bool page_claimed;
-
- zhdr = get_z3fold_header(handle);
- page = virt_to_page(zhdr);
- page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
-
- if (test_bit(PAGE_HEADLESS, &page->private)) {
- /* if a headless page is under reclaim, just leave.
- * NB: we use test_and_set_bit for a reason: if the bit
- * has not been set before, we release this page
- * immediately so we don't care about its value any more.
- */
- if (!page_claimed) {
- put_z3fold_header(zhdr);
- free_z3fold_page(page, true);
- atomic64_dec(&pool->pages_nr);
- }
- return;
- }
-
- /* Non-headless case */
- bud = handle_to_buddy(handle);
-
- switch (bud) {
- case FIRST:
- zhdr->first_chunks = 0;
- break;
- case MIDDLE:
- zhdr->middle_chunks = 0;
- break;
- case LAST:
- zhdr->last_chunks = 0;
- break;
- default:
- pr_err("%s: unknown bud %d\n", __func__, bud);
- WARN_ON(1);
- put_z3fold_header(zhdr);
- return;
- }
-
- if (!page_claimed)
- free_handle(handle, zhdr);
- if (put_z3fold_locked_list(zhdr))
- return;
- if (page_claimed) {
- /* the page has not been claimed by us */
- put_z3fold_header(zhdr);
- return;
- }
- if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
- clear_bit(PAGE_CLAIMED, &page->private);
- put_z3fold_header(zhdr);
- return;
- }
- if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
- zhdr->cpu = -1;
- kref_get(&zhdr->refcount);
- clear_bit(PAGE_CLAIMED, &page->private);
- do_compact_page(zhdr, true);
- return;
- }
- kref_get(&zhdr->refcount);
- clear_bit(PAGE_CLAIMED, &page->private);
- queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
- put_z3fold_header(zhdr);
-}
-
-/**
- * z3fold_map() - maps the allocation associated with the given handle
- * @pool: pool in which the allocation resides
- * @handle: handle associated with the allocation to be mapped
- *
- * Extracts the buddy number from handle and constructs the pointer to the
- * correct starting chunk within the page.
- *
- * Returns: a pointer to the mapped allocation
- */
-static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
-{
- struct z3fold_header *zhdr;
- struct page *page;
- void *addr;
- enum buddy buddy;
-
- zhdr = get_z3fold_header(handle);
- addr = zhdr;
- page = virt_to_page(zhdr);
-
- if (test_bit(PAGE_HEADLESS, &page->private))
- goto out;
-
- buddy = handle_to_buddy(handle);
- switch (buddy) {
- case FIRST:
- addr += ZHDR_SIZE_ALIGNED;
- break;
- case MIDDLE:
- addr += zhdr->start_middle << CHUNK_SHIFT;
- set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
- break;
- case LAST:
- addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
- break;
- default:
- pr_err("unknown buddy id %d\n", buddy);
- WARN_ON(1);
- addr = NULL;
- break;
- }
-
- if (addr)
- zhdr->mapped_count++;
-out:
- put_z3fold_header(zhdr);
- return addr;
-}
-
-/**
- * z3fold_unmap() - unmaps the allocation associated with the given handle
- * @pool: pool in which the allocation resides
- * @handle: handle associated with the allocation to be unmapped
- */
-static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
-{
- struct z3fold_header *zhdr;
- struct page *page;
- enum buddy buddy;
-
- zhdr = get_z3fold_header(handle);
- page = virt_to_page(zhdr);
-
- if (test_bit(PAGE_HEADLESS, &page->private))
- return;
-
- buddy = handle_to_buddy(handle);
- if (buddy == MIDDLE)
- clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
- zhdr->mapped_count--;
- put_z3fold_header(zhdr);
-}
-
-/**
- * z3fold_get_pool_pages() - gets the z3fold pool size in pages
- * @pool: pool whose size is being queried
- *
- * Returns: size in pages of the given pool.
- */
-static u64 z3fold_get_pool_pages(struct z3fold_pool *pool)
-{
- return atomic64_read(&pool->pages_nr);
-}
-
-static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
-{
- struct z3fold_header *zhdr;
- struct z3fold_pool *pool;
-
- VM_BUG_ON_PAGE(PageIsolated(page), page);
-
- if (test_bit(PAGE_HEADLESS, &page->private))
- return false;
-
- zhdr = page_address(page);
- z3fold_page_lock(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private) ||
- test_bit(PAGE_STALE, &page->private))
- goto out;
-
- if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
- goto out;
-
- if (test_and_set_bit(PAGE_CLAIMED, &page->private))
- goto out;
- pool = zhdr_to_pool(zhdr);
- spin_lock(&pool->lock);
- if (!list_empty(&zhdr->buddy))
- list_del_init(&zhdr->buddy);
- spin_unlock(&pool->lock);
-
- kref_get(&zhdr->refcount);
- z3fold_page_unlock(zhdr);
- return true;
-
-out:
- z3fold_page_unlock(zhdr);
- return false;
-}
-
-static int z3fold_page_migrate(struct page *newpage, struct page *page,
- enum migrate_mode mode)
-{
- struct z3fold_header *zhdr, *new_zhdr;
- struct z3fold_pool *pool;
-
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
- VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
- VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-
- zhdr = page_address(page);
- pool = zhdr_to_pool(zhdr);
-
- if (!z3fold_page_trylock(zhdr))
- return -EAGAIN;
- if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
- return -EBUSY;
- }
- if (work_pending(&zhdr->work)) {
- z3fold_page_unlock(zhdr);
- return -EAGAIN;
- }
- new_zhdr = page_address(newpage);
- memcpy(new_zhdr, zhdr, PAGE_SIZE);
- newpage->private = page->private;
- set_bit(PAGE_MIGRATED, &page->private);
- z3fold_page_unlock(zhdr);
- spin_lock_init(&new_zhdr->page_lock);
- INIT_WORK(&new_zhdr->work, compact_page_work);
- /*
- * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
- * so we only have to reinitialize it.
- */
- INIT_LIST_HEAD(&new_zhdr->buddy);
- __ClearPageMovable(page);
-
- get_page(newpage);
- z3fold_page_lock(new_zhdr);
- if (new_zhdr->first_chunks)
- encode_handle(new_zhdr, FIRST);
- if (new_zhdr->last_chunks)
- encode_handle(new_zhdr, LAST);
- if (new_zhdr->middle_chunks)
- encode_handle(new_zhdr, MIDDLE);
- set_bit(NEEDS_COMPACTING, &newpage->private);
- new_zhdr->cpu = smp_processor_id();
- __SetPageMovable(newpage, &z3fold_mops);
- z3fold_page_unlock(new_zhdr);
-
- queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
-
- /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */
- page->private = 0;
- put_page(page);
- return 0;
-}
-
-static void z3fold_page_putback(struct page *page)
-{
- struct z3fold_header *zhdr;
- struct z3fold_pool *pool;
-
- zhdr = page_address(page);
- pool = zhdr_to_pool(zhdr);
-
- z3fold_page_lock(zhdr);
- if (!list_empty(&zhdr->buddy))
- list_del_init(&zhdr->buddy);
- INIT_LIST_HEAD(&page->lru);
- if (put_z3fold_locked(zhdr))
- return;
- if (list_empty(&zhdr->buddy))
- add_to_unbuddied(pool, zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
-}
-
-static const struct movable_operations z3fold_mops = {
- .isolate_page = z3fold_page_isolate,
- .migrate_page = z3fold_page_migrate,
- .putback_page = z3fold_page_putback,
-};
-
-/*****************
- * zpool
- ****************/
-
-static void *z3fold_zpool_create(const char *name, gfp_t gfp)
-{
- return z3fold_create_pool(name, gfp);
-}
-
-static void z3fold_zpool_destroy(void *pool)
-{
- z3fold_destroy_pool(pool);
-}
-
-static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- return z3fold_alloc(pool, size, gfp, handle);
-}
-static void z3fold_zpool_free(void *pool, unsigned long handle)
-{
- z3fold_free(pool, handle);
-}
-
-static void *z3fold_zpool_map(void *pool, unsigned long handle,
- enum zpool_mapmode mm)
-{
- return z3fold_map(pool, handle);
-}
-static void z3fold_zpool_unmap(void *pool, unsigned long handle)
-{
- z3fold_unmap(pool, handle);
-}
-
-static u64 z3fold_zpool_total_pages(void *pool)
-{
- return z3fold_get_pool_pages(pool);
-}
-
-static struct zpool_driver z3fold_zpool_driver = {
- .type = "z3fold",
- .sleep_mapped = true,
- .owner = THIS_MODULE,
- .create = z3fold_zpool_create,
- .destroy = z3fold_zpool_destroy,
- .malloc = z3fold_zpool_malloc,
- .free = z3fold_zpool_free,
- .map = z3fold_zpool_map,
- .unmap = z3fold_zpool_unmap,
- .total_pages = z3fold_zpool_total_pages,
-};
-
-MODULE_ALIAS("zpool-z3fold");
-
-static int __init init_z3fold(void)
-{
- /*
- * Make sure the z3fold header is not larger than the page size and
- * there has remaining spaces for its buddy.
- */
- BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE);
- zpool_register_driver(&z3fold_zpool_driver);
-
- return 0;
-}
-
-static void __exit exit_z3fold(void)
-{
- zpool_unregister_driver(&z3fold_zpool_driver);
-}
-
-module_init(init_z3fold);
-module_exit(exit_z3fold);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
-MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");
diff --git a/mm/zbud.c b/mm/zbud.c
deleted file mode 100644
index e9836fff9438..000000000000
--- a/mm/zbud.c
+++ /dev/null
@@ -1,455 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * zbud.c
- *
- * Copyright (C) 2013, Seth Jennings, IBM
- *
- * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
- *
- * zbud is an special purpose allocator for storing compressed pages. Contrary
- * to what its name may suggest, zbud is not a buddy allocator, but rather an
- * allocator that "buddies" two compressed pages together in a single memory
- * page.
- *
- * While this design limits storage density, it has simple and deterministic
- * reclaim properties that make it preferable to a higher density approach when
- * reclaim will be used.
- *
- * zbud works by storing compressed pages, or "zpages", together in pairs in a
- * single memory page called a "zbud page". The first buddy is "left
- * justified" at the beginning of the zbud page, and the last buddy is "right
- * justified" at the end of the zbud page. The benefit is that if either
- * buddy is freed, the freed buddy space, coalesced with whatever slack space
- * that existed between the buddies, results in the largest possible free region
- * within the zbud page.
- *
- * zbud also provides an attractive lower bound on density. The ratio of zpages
- * to zbud pages can not be less than 1. This ensures that zbud can never "do
- * harm" by using more pages to store zpages than the uncompressed zpages would
- * have used on their own.
- *
- * zbud pages are divided into "chunks". The size of the chunks is fixed at
- * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
- * into chunks allows organizing unbuddied zbud pages into a manageable number
- * of unbuddied lists according to the number of free chunks available in the
- * zbud page.
- *
- * The zbud API differs from that of conventional allocators in that the
- * allocation function, zbud_alloc(), returns an opaque handle to the user,
- * not a dereferenceable pointer. The user must map the handle using
- * zbud_map() in order to get a usable pointer by which to access the
- * allocation data and unmap the handle with zbud_unmap() when operations
- * on the allocation data are complete.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/atomic.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/preempt.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/zpool.h>
-
-/*****************
- * Structures
-*****************/
-/*
- * NCHUNKS_ORDER determines the internal allocation granularity, effectively
- * adjusting internal fragmentation. It also determines the number of
- * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
- * in allocated page is occupied by zbud header, NCHUNKS will be calculated to
- * 63 which shows the max number of free chunks in zbud page, also there will be
- * 63 freelists per pool.
- */
-#define NCHUNKS_ORDER 6
-
-#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
-#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
-#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
-
-struct zbud_pool;
-
-/**
- * struct zbud_pool - stores metadata for each zbud pool
- * @lock: protects all pool fields and first|last_chunk fields of any
- * zbud page in the pool
- * @unbuddied: array of lists tracking zbud pages that only contain one buddy;
- * the lists each zbud page is added to depends on the size of
- * its free region.
- * @buddied: list tracking the zbud pages that contain two buddies;
- * these zbud pages are full
- * @pages_nr: number of zbud pages in the pool.
- *
- * This structure is allocated at pool creation time and maintains metadata
- * pertaining to a particular zbud pool.
- */
-struct zbud_pool {
- spinlock_t lock;
- union {
- /*
- * Reuse unbuddied[0] as buddied on the ground that
- * unbuddied[0] is unused.
- */
- struct list_head buddied;
- struct list_head unbuddied[NCHUNKS];
- };
- u64 pages_nr;
-};
-
-/*
- * struct zbud_header - zbud page metadata occupying the first chunk of each
- * zbud page.
- * @buddy: links the zbud page into the unbuddied/buddied lists in the pool
- * @first_chunks: the size of the first buddy in chunks, 0 if free
- * @last_chunks: the size of the last buddy in chunks, 0 if free
- */
-struct zbud_header {
- struct list_head buddy;
- unsigned int first_chunks;
- unsigned int last_chunks;
-};
-
-/*****************
- * Helpers
-*****************/
-/* Just to make the code easier to read */
-enum buddy {
- FIRST,
- LAST
-};
-
-/* Converts an allocation size in bytes to size in zbud chunks */
-static int size_to_chunks(size_t size)
-{
- return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
-}
-
-#define for_each_unbuddied_list(_iter, _begin) \
- for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
-
-/* Initializes the zbud header of a newly allocated zbud page */
-static struct zbud_header *init_zbud_page(struct page *page)
-{
- struct zbud_header *zhdr = page_address(page);
- zhdr->first_chunks = 0;
- zhdr->last_chunks = 0;
- INIT_LIST_HEAD(&zhdr->buddy);
- return zhdr;
-}
-
-/* Resets the struct page fields and frees the page */
-static void free_zbud_page(struct zbud_header *zhdr)
-{
- __free_page(virt_to_page(zhdr));
-}
-
-/*
- * Encodes the handle of a particular buddy within a zbud page
- * Pool lock should be held as this function accesses first|last_chunks
- */
-static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
-{
- unsigned long handle;
-
- /*
- * For now, the encoded handle is actually just the pointer to the data
- * but this might not always be the case. A little information hiding.
- * Add CHUNK_SIZE to the handle if it is the first allocation to jump
- * over the zbud header in the first chunk.
- */
- handle = (unsigned long)zhdr;
- if (bud == FIRST)
- /* skip over zbud header */
- handle += ZHDR_SIZE_ALIGNED;
- else /* bud == LAST */
- handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
- return handle;
-}
-
-/* Returns the zbud page where a given handle is stored */
-static struct zbud_header *handle_to_zbud_header(unsigned long handle)
-{
- return (struct zbud_header *)(handle & PAGE_MASK);
-}
-
-/* Returns the number of free chunks in a zbud page */
-static int num_free_chunks(struct zbud_header *zhdr)
-{
- /*
- * Rather than branch for different situations, just use the fact that
- * free buddies have a length of zero to simplify everything.
- */
- return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
-}
-
-/*****************
- * API Functions
-*****************/
-/**
- * zbud_create_pool() - create a new zbud pool
- * @gfp: gfp flags when allocating the zbud pool structure
- *
- * Return: pointer to the new zbud pool or NULL if the metadata allocation
- * failed.
- */
-static struct zbud_pool *zbud_create_pool(gfp_t gfp)
-{
- struct zbud_pool *pool;
- int i;
-
- pool = kzalloc(sizeof(struct zbud_pool), gfp);
- if (!pool)
- return NULL;
- spin_lock_init(&pool->lock);
- for_each_unbuddied_list(i, 0)
- INIT_LIST_HEAD(&pool->unbuddied[i]);
- INIT_LIST_HEAD(&pool->buddied);
- pool->pages_nr = 0;
- return pool;
-}
-
-/**
- * zbud_destroy_pool() - destroys an existing zbud pool
- * @pool: the zbud pool to be destroyed
- *
- * The pool should be emptied before this function is called.
- */
-static void zbud_destroy_pool(struct zbud_pool *pool)
-{
- kfree(pool);
-}
-
-/**
- * zbud_alloc() - allocates a region of a given size
- * @pool: zbud pool from which to allocate
- * @size: size in bytes of the desired allocation
- * @gfp: gfp flags used if the pool needs to grow
- * @handle: handle of the new allocation
- *
- * This function will attempt to find a free region in the pool large enough to
- * satisfy the allocation request. A search of the unbuddied lists is
- * performed first. If no suitable free region is found, then a new page is
- * allocated and added to the pool to satisfy the request.
- *
- * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
- * as zbud pool pages.
- *
- * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
- * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
- * a new page.
- */
-static int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- int chunks, i, freechunks;
- struct zbud_header *zhdr = NULL;
- enum buddy bud;
- struct page *page;
-
- if (!size || (gfp & __GFP_HIGHMEM))
- return -EINVAL;
- if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
- return -ENOSPC;
- chunks = size_to_chunks(size);
- spin_lock(&pool->lock);
-
- /* First, try to find an unbuddied zbud page. */
- for_each_unbuddied_list(i, chunks) {
- if (!list_empty(&pool->unbuddied[i])) {
- zhdr = list_first_entry(&pool->unbuddied[i],
- struct zbud_header, buddy);
- list_del(&zhdr->buddy);
- if (zhdr->first_chunks == 0)
- bud = FIRST;
- else
- bud = LAST;
- goto found;
- }
- }
-
- /* Couldn't find unbuddied zbud page, create new one */
- spin_unlock(&pool->lock);
- page = alloc_page(gfp);
- if (!page)
- return -ENOMEM;
- spin_lock(&pool->lock);
- pool->pages_nr++;
- zhdr = init_zbud_page(page);
- bud = FIRST;
-
-found:
- if (bud == FIRST)
- zhdr->first_chunks = chunks;
- else
- zhdr->last_chunks = chunks;
-
- if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
- /* Add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
- } else {
- /* Add to buddied list */
- list_add(&zhdr->buddy, &pool->buddied);
- }
-
- *handle = encode_handle(zhdr, bud);
- spin_unlock(&pool->lock);
-
- return 0;
-}
-
-/**
- * zbud_free() - frees the allocation associated with the given handle
- * @pool: pool in which the allocation resided
- * @handle: handle associated with the allocation returned by zbud_alloc()
- */
-static void zbud_free(struct zbud_pool *pool, unsigned long handle)
-{
- struct zbud_header *zhdr;
- int freechunks;
-
- spin_lock(&pool->lock);
- zhdr = handle_to_zbud_header(handle);
-
- /* If first buddy, handle will be page aligned */
- if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
- zhdr->last_chunks = 0;
- else
- zhdr->first_chunks = 0;
-
- /* Remove from existing buddy list */
- list_del(&zhdr->buddy);
-
- if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
- /* zbud page is empty, free */
- free_zbud_page(zhdr);
- pool->pages_nr--;
- } else {
- /* Add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
- }
-
- spin_unlock(&pool->lock);
-}
-
-/**
- * zbud_map() - maps the allocation associated with the given handle
- * @pool: pool in which the allocation resides
- * @handle: handle associated with the allocation to be mapped
- *
- * While trivial for zbud, the mapping functions for others allocators
- * implementing this allocation API could have more complex information encoded
- * in the handle and could create temporary mappings to make the data
- * accessible to the user.
- *
- * Returns: a pointer to the mapped allocation
- */
-static void *zbud_map(struct zbud_pool *pool, unsigned long handle)
-{
- return (void *)(handle);
-}
-
-/**
- * zbud_unmap() - maps the allocation associated with the given handle
- * @pool: pool in which the allocation resides
- * @handle: handle associated with the allocation to be unmapped
- */
-static void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
-{
-}
-
-/**
- * zbud_get_pool_pages() - gets the zbud pool size in pages
- * @pool: pool whose size is being queried
- *
- * Returns: size in pages of the given pool. The pool lock need not be
- * taken to access pages_nr.
- */
-static u64 zbud_get_pool_pages(struct zbud_pool *pool)
-{
- return pool->pages_nr;
-}
-
-/*****************
- * zpool
- ****************/
-
-static void *zbud_zpool_create(const char *name, gfp_t gfp)
-{
- return zbud_create_pool(gfp);
-}
-
-static void zbud_zpool_destroy(void *pool)
-{
- zbud_destroy_pool(pool);
-}
-
-static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- return zbud_alloc(pool, size, gfp, handle);
-}
-static void zbud_zpool_free(void *pool, unsigned long handle)
-{
- zbud_free(pool, handle);
-}
-
-static void *zbud_zpool_map(void *pool, unsigned long handle,
- enum zpool_mapmode mm)
-{
- return zbud_map(pool, handle);
-}
-static void zbud_zpool_unmap(void *pool, unsigned long handle)
-{
- zbud_unmap(pool, handle);
-}
-
-static u64 zbud_zpool_total_pages(void *pool)
-{
- return zbud_get_pool_pages(pool);
-}
-
-static struct zpool_driver zbud_zpool_driver = {
- .type = "zbud",
- .sleep_mapped = true,
- .owner = THIS_MODULE,
- .create = zbud_zpool_create,
- .destroy = zbud_zpool_destroy,
- .malloc = zbud_zpool_malloc,
- .free = zbud_zpool_free,
- .map = zbud_zpool_map,
- .unmap = zbud_zpool_unmap,
- .total_pages = zbud_zpool_total_pages,
-};
-
-MODULE_ALIAS("zpool-zbud");
-
-static int __init init_zbud(void)
-{
- /* Make sure the zbud header will fit in one chunk */
- BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
- pr_info("loaded\n");
-
- zpool_register_driver(&zbud_zpool_driver);
-
- return 0;
-}
-
-static void __exit exit_zbud(void)
-{
- zpool_unregister_driver(&zbud_zpool_driver);
- pr_info("unloaded\n");
-}
-
-module_init(init_zbud);
-module_exit(exit_zbud);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
-MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
diff --git a/mm/zpool.c b/mm/zpool.c
index b9fda1fa857d..6d6d88930932 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -95,7 +95,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
/**
* zpool_has_pool() - Check if the pool driver is available
- * @type: The type of the zpool to check (e.g. zbud, zsmalloc)
+ * @type: The type of the zpool to check (e.g. zsmalloc)
*
* This checks if the @type pool driver is available. This will try to load
* the requested module, if needed, but there is no guarantee the module will
@@ -130,7 +130,7 @@ EXPORT_SYMBOL(zpool_has_pool);
/**
* zpool_create_pool() - Create a new zpool
- * @type: The type of the zpool to create (e.g. zbud, zsmalloc)
+ * @type: The type of the zpool to create (e.g. zsmalloc)
* @name: The name of the zpool (e.g. zram0, zswap)
* @gfp: The GFP flags to use when allocating the pool.
*
@@ -221,22 +221,6 @@ const char *zpool_get_type(struct zpool *zpool)
}
/**
- * zpool_malloc_support_movable() - Check if the zpool supports
- * allocating movable memory
- * @zpool: The zpool to check
- *
- * This returns if the zpool supports allocating movable memory.
- *
- * Implementations must guarantee this to be thread-safe.
- *
- * Returns: true if the zpool supports allocating movable memory, false if not
- */
-bool zpool_malloc_support_movable(struct zpool *zpool)
-{
- return zpool->driver->malloc_support_movable;
-}
-
-/**
* zpool_malloc() - Allocate memory
* @zpool: The zpool to allocate from.
* @size: The amount of memory to allocate.
@@ -278,46 +262,51 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
}
/**
- * zpool_map_handle() - Map a previously allocated handle into memory
+ * zpool_obj_read_begin() - Start reading from a previously allocated handle.
* @zpool: The zpool that the handle was allocated from
- * @handle: The handle to map
- * @mapmode: How the memory should be mapped
+ * @handle: The handle to read from
+ * @local_copy: A local buffer to use if needed.
*
- * This maps a previously allocated handle into memory. The @mapmode
- * param indicates to the implementation how the memory will be
- * used, i.e. read-only, write-only, read-write. If the
- * implementation does not support it, the memory will be treated
- * as read-write.
+ * This starts a read operation of a previously allocated handle. The passed
+ * @local_copy buffer may be used if needed by copying the memory into.
+ * zpool_obj_read_end() MUST be called after the read is completed to undo any
+ * actions taken (e.g. release locks).
*
- * This may hold locks, disable interrupts, and/or preemption,
- * and the zpool_unmap_handle() must be called to undo those
- * actions. The code that uses the mapped handle should complete
- * its operations on the mapped handle memory quickly and unmap
- * as soon as possible. As the implementation may use per-cpu
- * data, multiple handles should not be mapped concurrently on
- * any cpu.
+ * Returns: A pointer to the handle memory to be read, if @local_copy is used,
+ * the returned pointer is @local_copy.
+ */
+void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle,
+ void *local_copy)
+{
+ return zpool->driver->obj_read_begin(zpool->pool, handle, local_copy);
+}
+
+/**
+ * zpool_obj_read_end() - Finish reading from a previously allocated handle.
+ * @zpool: The zpool that the handle was allocated from
+ * @handle: The handle to read from
+ * @handle_mem: The pointer returned by zpool_obj_read_begin()
*
- * Returns: A pointer to the handle's mapped memory area.
+ * Finishes a read operation previously started by zpool_obj_read_begin().
*/
-void *zpool_map_handle(struct zpool *zpool, unsigned long handle,
- enum zpool_mapmode mapmode)
+void zpool_obj_read_end(struct zpool *zpool, unsigned long handle,
+ void *handle_mem)
{
- return zpool->driver->map(zpool->pool, handle, mapmode);
+ zpool->driver->obj_read_end(zpool->pool, handle, handle_mem);
}
/**
- * zpool_unmap_handle() - Unmap a previously mapped handle
+ * zpool_obj_write() - Write to a previously allocated handle.
* @zpool: The zpool that the handle was allocated from
- * @handle: The handle to unmap
+ * @handle: The handle to read from
+ * @handle_mem: The memory to copy from into the handle.
+ * @mem_len: The length of memory to be written.
*
- * This unmaps a previously mapped handle. Any locks or other
- * actions that the implementation took in zpool_map_handle()
- * will be undone here. The memory area returned from
- * zpool_map_handle() should no longer be used after this.
*/
-void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
+void zpool_obj_write(struct zpool *zpool, unsigned long handle,
+ void *handle_mem, size_t mem_len)
{
- zpool->driver->unmap(zpool->pool, handle);
+ zpool->driver->obj_write(zpool->pool, handle, handle_mem, mem_len);
}
/**
@@ -333,23 +322,5 @@ u64 zpool_get_total_pages(struct zpool *zpool)
return zpool->driver->total_pages(zpool->pool);
}
-/**
- * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped.
- * @zpool: The zpool to test
- *
- * Some allocators enter non-preemptible context in ->map() callback (e.g.
- * disable pagefaults) and exit that context in ->unmap(), which limits what
- * we can do with the mapped object. For instance, we cannot wait for
- * asynchronous crypto API to decompress such an object or take mutexes
- * since those will call into the scheduler. This function tells us whether
- * we use such an allocator.
- *
- * Returns: true if zpool can sleep; false otherwise.
- */
-bool zpool_can_sleep_mapped(struct zpool *zpool)
-{
- return zpool->driver->sleep_mapped;
-}
-
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 6d0e47f7ae33..961b270f023c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -18,7 +18,7 @@
/*
* lock ordering:
* page_lock
- * pool->migrate_lock
+ * pool->lock
* class->lock
* zspage->lock
*/
@@ -223,8 +223,8 @@ struct zs_pool {
#ifdef CONFIG_COMPACTION
struct work_struct free_work;
#endif
- /* protect page/zspage migration */
- rwlock_t migrate_lock;
+ /* protect zspage migration/compaction */
+ rwlock_t lock;
atomic_t compaction_in_progress;
};
@@ -257,6 +257,15 @@ static inline void free_zpdesc(struct zpdesc *zpdesc)
__free_page(page);
}
+#define ZS_PAGE_UNLOCKED 0
+#define ZS_PAGE_WRLOCKED -1
+
+struct zspage_lock {
+ spinlock_t lock;
+ int cnt;
+ struct lockdep_map dep_map;
+};
+
struct zspage {
struct {
unsigned int huge:HUGE_BITS;
@@ -269,15 +278,86 @@ struct zspage {
struct zpdesc *first_zpdesc;
struct list_head list; /* fullness list */
struct zs_pool *pool;
- rwlock_t lock;
+ struct zspage_lock zsl;
};
-struct mapping_area {
- local_lock_t lock;
- char *vm_buf; /* copy buffer for objects that span pages */
- char *vm_addr; /* address of kmap_local_page()'ed pages */
- enum zs_mapmode vm_mm; /* mapping mode */
-};
+static void zspage_lock_init(struct zspage *zspage)
+{
+ static struct lock_class_key __key;
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ lockdep_init_map(&zsl->dep_map, "zspage->lock", &__key, 0);
+ spin_lock_init(&zsl->lock);
+ zsl->cnt = ZS_PAGE_UNLOCKED;
+}
+
+/*
+ * The zspage lock can be held from atomic contexts, but it needs to remain
+ * preemptible when held for reading because it remains held outside of those
+ * atomic contexts, otherwise we unnecessarily lose preemptibility.
+ *
+ * To achieve this, the following rules are enforced on readers and writers:
+ *
+ * - Writers are blocked by both writers and readers, while readers are only
+ * blocked by writers (i.e. normal rwlock semantics).
+ *
+ * - Writers are always atomic (to allow readers to spin waiting for them).
+ *
+ * - Writers always use trylock (as the lock may be held be sleeping readers).
+ *
+ * - Readers may spin on the lock (as they can only wait for atomic writers).
+ *
+ * - Readers may sleep while holding the lock (as writes only use trylock).
+ */
+static void zspage_read_lock(struct zspage *zspage)
+{
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ rwsem_acquire_read(&zsl->dep_map, 0, 0, _RET_IP_);
+
+ spin_lock(&zsl->lock);
+ zsl->cnt++;
+ spin_unlock(&zsl->lock);
+
+ lock_acquired(&zsl->dep_map, _RET_IP_);
+}
+
+static void zspage_read_unlock(struct zspage *zspage)
+{
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ rwsem_release(&zsl->dep_map, _RET_IP_);
+
+ spin_lock(&zsl->lock);
+ zsl->cnt--;
+ spin_unlock(&zsl->lock);
+}
+
+static __must_check bool zspage_write_trylock(struct zspage *zspage)
+{
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ spin_lock(&zsl->lock);
+ if (zsl->cnt == ZS_PAGE_UNLOCKED) {
+ zsl->cnt = ZS_PAGE_WRLOCKED;
+ rwsem_acquire(&zsl->dep_map, 0, 1, _RET_IP_);
+ lock_acquired(&zsl->dep_map, _RET_IP_);
+ return true;
+ }
+
+ spin_unlock(&zsl->lock);
+ return false;
+}
+
+static void zspage_write_unlock(struct zspage *zspage)
+{
+ struct zspage_lock *zsl = &zspage->zsl;
+
+ rwsem_release(&zsl->dep_map, _RET_IP_);
+
+ zsl->cnt = ZS_PAGE_UNLOCKED;
+ spin_unlock(&zsl->lock);
+}
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
static void SetZsHugePage(struct zspage *zspage)
@@ -290,12 +370,6 @@ static bool ZsHugePage(struct zspage *zspage)
return zspage->huge;
}
-static void migrate_lock_init(struct zspage *zspage);
-static void migrate_read_lock(struct zspage *zspage);
-static void migrate_read_unlock(struct zspage *zspage);
-static void migrate_write_lock(struct zspage *zspage);
-static void migrate_write_unlock(struct zspage *zspage);
-
#ifdef CONFIG_COMPACTION
static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
@@ -401,29 +475,22 @@ static void zs_zpool_free(void *pool, unsigned long handle)
zs_free(pool, handle);
}
-static void *zs_zpool_map(void *pool, unsigned long handle,
- enum zpool_mapmode mm)
+static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle,
+ void *local_copy)
{
- enum zs_mapmode zs_mm;
-
- switch (mm) {
- case ZPOOL_MM_RO:
- zs_mm = ZS_MM_RO;
- break;
- case ZPOOL_MM_WO:
- zs_mm = ZS_MM_WO;
- break;
- case ZPOOL_MM_RW:
- default:
- zs_mm = ZS_MM_RW;
- break;
- }
+ return zs_obj_read_begin(pool, handle, local_copy);
+}
- return zs_map_object(pool, handle, zs_mm);
+static void zs_zpool_obj_read_end(void *pool, unsigned long handle,
+ void *handle_mem)
+{
+ zs_obj_read_end(pool, handle, handle_mem);
}
-static void zs_zpool_unmap(void *pool, unsigned long handle)
+
+static void zs_zpool_obj_write(void *pool, unsigned long handle,
+ void *handle_mem, size_t mem_len)
{
- zs_unmap_object(pool, handle);
+ zs_obj_write(pool, handle, handle_mem, mem_len);
}
static u64 zs_zpool_total_pages(void *pool)
@@ -436,22 +503,17 @@ static struct zpool_driver zs_zpool_driver = {
.owner = THIS_MODULE,
.create = zs_zpool_create,
.destroy = zs_zpool_destroy,
- .malloc_support_movable = true,
.malloc = zs_zpool_malloc,
.free = zs_zpool_free,
- .map = zs_zpool_map,
- .unmap = zs_zpool_unmap,
+ .obj_read_begin = zs_zpool_obj_read_begin,
+ .obj_read_end = zs_zpool_obj_read_end,
+ .obj_write = zs_zpool_obj_write,
.total_pages = zs_zpool_total_pages,
};
MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
-/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
-static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = {
- .lock = INIT_LOCAL_LOCK(lock),
-};
-
static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc)
{
return PagePrivate(zpdesc_page(zpdesc));
@@ -992,7 +1054,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
return NULL;
zspage->magic = ZSPAGE_MAGIC;
- migrate_lock_init(zspage);
+ zspage->pool = pool;
+ zspage->class = class->index;
+ zspage_lock_init(zspage);
for (i = 0; i < class->pages_per_zspage; i++) {
struct zpdesc *zpdesc;
@@ -1015,8 +1079,6 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
create_page_chain(class, zspage, zpdescs);
init_zspage(class, zspage);
- zspage->pool = pool;
- zspage->class = class->index;
return zspage;
}
@@ -1036,93 +1098,6 @@ static struct zspage *find_get_zspage(struct size_class *class)
return zspage;
}
-static inline int __zs_cpu_up(struct mapping_area *area)
-{
- /*
- * Make sure we don't leak memory if a cpu UP notification
- * and zs_init() race and both call zs_cpu_up() on the same cpu
- */
- if (area->vm_buf)
- return 0;
- area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
- if (!area->vm_buf)
- return -ENOMEM;
- return 0;
-}
-
-static inline void __zs_cpu_down(struct mapping_area *area)
-{
- kfree(area->vm_buf);
- area->vm_buf = NULL;
-}
-
-static void *__zs_map_object(struct mapping_area *area,
- struct zpdesc *zpdescs[2], int off, int size)
-{
- size_t sizes[2];
- char *buf = area->vm_buf;
-
- /* disable page faults to match kmap_local_page() return conditions */
- pagefault_disable();
-
- /* no read fastpath */
- if (area->vm_mm == ZS_MM_WO)
- goto out;
-
- sizes[0] = PAGE_SIZE - off;
- sizes[1] = size - sizes[0];
-
- /* copy object to per-cpu buffer */
- memcpy_from_page(buf, zpdesc_page(zpdescs[0]), off, sizes[0]);
- memcpy_from_page(buf + sizes[0], zpdesc_page(zpdescs[1]), 0, sizes[1]);
-out:
- return area->vm_buf;
-}
-
-static void __zs_unmap_object(struct mapping_area *area,
- struct zpdesc *zpdescs[2], int off, int size)
-{
- size_t sizes[2];
- char *buf;
-
- /* no write fastpath */
- if (area->vm_mm == ZS_MM_RO)
- goto out;
-
- buf = area->vm_buf;
- buf = buf + ZS_HANDLE_SIZE;
- size -= ZS_HANDLE_SIZE;
- off += ZS_HANDLE_SIZE;
-
- sizes[0] = PAGE_SIZE - off;
- sizes[1] = size - sizes[0];
-
- /* copy per-cpu buffer to object */
- memcpy_to_page(zpdesc_page(zpdescs[0]), off, buf, sizes[0]);
- memcpy_to_page(zpdesc_page(zpdescs[1]), 0, buf + sizes[0], sizes[1]);
-
-out:
- /* enable page faults to match kunmap_local() return conditions */
- pagefault_enable();
-}
-
-static int zs_cpu_prepare(unsigned int cpu)
-{
- struct mapping_area *area;
-
- area = &per_cpu(zs_map_area, cpu);
- return __zs_cpu_up(area);
-}
-
-static int zs_cpu_dead(unsigned int cpu)
-{
- struct mapping_area *area;
-
- area = &per_cpu(zs_map_area, cpu);
- __zs_cpu_down(area);
- return 0;
-}
-
static bool can_merge(struct size_class *prev, int pages_per_zspage,
int objs_per_zspage)
{
@@ -1170,92 +1145,64 @@ unsigned long zs_get_total_pages(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);
-/**
- * zs_map_object - get address of allocated object from handle.
- * @pool: pool from which the object was allocated
- * @handle: handle returned from zs_malloc
- * @mm: mapping mode to use
- *
- * Before using an object allocated from zs_malloc, it must be mapped using
- * this function. When done with the object, it must be unmapped using
- * zs_unmap_object.
- *
- * Only one object can be mapped per cpu at a time. There is no protection
- * against nested mappings.
- *
- * This function returns with preemption and page faults disabled.
- */
-void *zs_map_object(struct zs_pool *pool, unsigned long handle,
- enum zs_mapmode mm)
+void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle,
+ void *local_copy)
{
struct zspage *zspage;
struct zpdesc *zpdesc;
unsigned long obj, off;
unsigned int obj_idx;
-
struct size_class *class;
- struct mapping_area *area;
- struct zpdesc *zpdescs[2];
- void *ret;
+ void *addr;
- /*
- * Because we use per-cpu mapping areas shared among the
- * pools/users, we can't allow mapping in interrupt context
- * because it can corrupt another users mappings.
- */
- BUG_ON(in_interrupt());
-
- /* It guarantees it can get zspage from handle safely */
- read_lock(&pool->migrate_lock);
+ /* Guarantee we can get zspage from handle safely */
+ read_lock(&pool->lock);
obj = handle_to_obj(handle);
obj_to_location(obj, &zpdesc, &obj_idx);
zspage = get_zspage(zpdesc);
- /*
- * migration cannot move any zpages in this zspage. Here, class->lock
- * is too heavy since callers would take some time until they calls
- * zs_unmap_object API so delegate the locking from class to zspage
- * which is smaller granularity.
- */
- migrate_read_lock(zspage);
- read_unlock(&pool->migrate_lock);
+ /* Make sure migration doesn't move any pages in this zspage */
+ zspage_read_lock(zspage);
+ read_unlock(&pool->lock);
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
- local_lock(&zs_map_area.lock);
- area = this_cpu_ptr(&zs_map_area);
- area->vm_mm = mm;
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
- area->vm_addr = kmap_local_zpdesc(zpdesc);
- ret = area->vm_addr + off;
- goto out;
+ addr = kmap_local_zpdesc(zpdesc);
+ addr += off;
+ } else {
+ size_t sizes[2];
+
+ /* this object spans two pages */
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = class->size - sizes[0];
+ addr = local_copy;
+
+ memcpy_from_page(addr, zpdesc_page(zpdesc),
+ off, sizes[0]);
+ zpdesc = get_next_zpdesc(zpdesc);
+ memcpy_from_page(addr + sizes[0],
+ zpdesc_page(zpdesc),
+ 0, sizes[1]);
}
- /* this object spans two pages */
- zpdescs[0] = zpdesc;
- zpdescs[1] = get_next_zpdesc(zpdesc);
- BUG_ON(!zpdescs[1]);
-
- ret = __zs_map_object(area, zpdescs, off, class->size);
-out:
- if (likely(!ZsHugePage(zspage)))
- ret += ZS_HANDLE_SIZE;
+ if (!ZsHugePage(zspage))
+ addr += ZS_HANDLE_SIZE;
- return ret;
+ return addr;
}
-EXPORT_SYMBOL_GPL(zs_map_object);
+EXPORT_SYMBOL_GPL(zs_obj_read_begin);
-void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
+void zs_obj_read_end(struct zs_pool *pool, unsigned long handle,
+ void *handle_mem)
{
struct zspage *zspage;
struct zpdesc *zpdesc;
unsigned long obj, off;
unsigned int obj_idx;
-
struct size_class *class;
- struct mapping_area *area;
obj = handle_to_obj(handle);
obj_to_location(obj, &zpdesc, &obj_idx);
@@ -1263,23 +1210,65 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
- area = this_cpu_ptr(&zs_map_area);
- if (off + class->size <= PAGE_SIZE)
- kunmap_local(area->vm_addr);
- else {
- struct zpdesc *zpdescs[2];
+ if (off + class->size <= PAGE_SIZE) {
+ if (!ZsHugePage(zspage))
+ off += ZS_HANDLE_SIZE;
+ handle_mem -= off;
+ kunmap_local(handle_mem);
+ }
+
+ zspage_read_unlock(zspage);
+}
+EXPORT_SYMBOL_GPL(zs_obj_read_end);
+
+void zs_obj_write(struct zs_pool *pool, unsigned long handle,
+ void *handle_mem, size_t mem_len)
+{
+ struct zspage *zspage;
+ struct zpdesc *zpdesc;
+ unsigned long obj, off;
+ unsigned int obj_idx;
+ struct size_class *class;
- zpdescs[0] = zpdesc;
- zpdescs[1] = get_next_zpdesc(zpdesc);
- BUG_ON(!zpdescs[1]);
+ /* Guarantee we can get zspage from handle safely */
+ read_lock(&pool->lock);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &zpdesc, &obj_idx);
+ zspage = get_zspage(zpdesc);
- __zs_unmap_object(area, zpdescs, off, class->size);
+ /* Make sure migration doesn't move any pages in this zspage */
+ zspage_read_lock(zspage);
+ read_unlock(&pool->lock);
+
+ class = zspage_class(pool, zspage);
+ off = offset_in_page(class->size * obj_idx);
+
+ if (off + class->size <= PAGE_SIZE) {
+ /* this object is contained entirely within a page */
+ void *dst = kmap_local_zpdesc(zpdesc);
+
+ if (!ZsHugePage(zspage))
+ off += ZS_HANDLE_SIZE;
+ memcpy(dst + off, handle_mem, mem_len);
+ kunmap_local(dst);
+ } else {
+ /* this object spans two pages */
+ size_t sizes[2];
+
+ off += ZS_HANDLE_SIZE;
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = mem_len - sizes[0];
+
+ memcpy_to_page(zpdesc_page(zpdesc), off,
+ handle_mem, sizes[0]);
+ zpdesc = get_next_zpdesc(zpdesc);
+ memcpy_to_page(zpdesc_page(zpdesc), 0,
+ handle_mem + sizes[0], sizes[1]);
}
- local_unlock(&zs_map_area.lock);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
}
-EXPORT_SYMBOL_GPL(zs_unmap_object);
+EXPORT_SYMBOL_GPL(zs_obj_write);
/**
* zs_huge_class_size() - Returns the size (in bytes) of the first huge
@@ -1450,16 +1439,16 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
return;
/*
- * The pool->migrate_lock protects the race with zpage's migration
+ * The pool->lock protects the race with zpage's migration
* so it's safe to get the page from handle.
*/
- read_lock(&pool->migrate_lock);
+ read_lock(&pool->lock);
obj = handle_to_obj(handle);
obj_to_zpdesc(obj, &f_zpdesc);
zspage = get_zspage(f_zpdesc);
class = zspage_class(pool, zspage);
spin_lock(&class->lock);
- read_unlock(&pool->migrate_lock);
+ read_unlock(&pool->lock);
class_stat_sub(class, ZS_OBJS_INUSE, 1);
obj_free(class->size, obj);
@@ -1671,18 +1660,18 @@ static void lock_zspage(struct zspage *zspage)
/*
* Pages we haven't locked yet can be migrated off the list while we're
* trying to lock them, so we need to be careful and only attempt to
- * lock each page under migrate_read_lock(). Otherwise, the page we lock
+ * lock each page under zspage_read_lock(). Otherwise, the page we lock
* may no longer belong to the zspage. This means that we may wait for
* the wrong page to unlock, so we must take a reference to the page
- * prior to waiting for it to unlock outside migrate_read_lock().
+ * prior to waiting for it to unlock outside zspage_read_lock().
*/
while (1) {
- migrate_read_lock(zspage);
+ zspage_read_lock(zspage);
zpdesc = get_first_zpdesc(zspage);
if (zpdesc_trylock(zpdesc))
break;
zpdesc_get(zpdesc);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
zpdesc_wait_locked(zpdesc);
zpdesc_put(zpdesc);
}
@@ -1693,41 +1682,16 @@ static void lock_zspage(struct zspage *zspage)
curr_zpdesc = zpdesc;
} else {
zpdesc_get(zpdesc);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
zpdesc_wait_locked(zpdesc);
zpdesc_put(zpdesc);
- migrate_read_lock(zspage);
+ zspage_read_lock(zspage);
}
}
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
}
#endif /* CONFIG_COMPACTION */
-static void migrate_lock_init(struct zspage *zspage)
-{
- rwlock_init(&zspage->lock);
-}
-
-static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
-{
- read_lock(&zspage->lock);
-}
-
-static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
-{
- read_unlock(&zspage->lock);
-}
-
-static void migrate_write_lock(struct zspage *zspage)
-{
- write_lock(&zspage->lock);
-}
-
-static void migrate_write_unlock(struct zspage *zspage)
-{
- write_unlock(&zspage->lock);
-}
-
#ifdef CONFIG_COMPACTION
static const struct movable_operations zsmalloc_mops;
@@ -1785,9 +1749,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc));
- /* We're committed, tell the world that this is a Zsmalloc page. */
- __zpdesc_set_zsmalloc(newzpdesc);
-
/* The page is locked, so this pointer must remain valid */
zspage = get_zspage(zpdesc);
pool = zspage->pool;
@@ -1796,15 +1757,22 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
* The pool migrate_lock protects the race between zpage migration
* and zs_free.
*/
- write_lock(&pool->migrate_lock);
+ write_lock(&pool->lock);
class = zspage_class(pool, zspage);
/*
* the class lock protects zpage alloc/free in the zspage.
*/
spin_lock(&class->lock);
- /* the migrate_write_lock protects zpage access via zs_map_object */
- migrate_write_lock(zspage);
+ /* the zspage write_lock protects zpage access via zs_obj_read/write() */
+ if (!zspage_write_trylock(zspage)) {
+ spin_unlock(&class->lock);
+ write_unlock(&pool->lock);
+ return -EINVAL;
+ }
+
+ /* We're committed, tell the world that this is a Zsmalloc page. */
+ __zpdesc_set_zsmalloc(newzpdesc);
offset = get_first_obj_offset(zpdesc);
s_addr = kmap_local_zpdesc(zpdesc);
@@ -1833,9 +1801,9 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
* Since we complete the data copy and set up new zspage structure,
* it's okay to release migration_lock.
*/
- write_unlock(&pool->migrate_lock);
+ write_unlock(&pool->lock);
spin_unlock(&class->lock);
- migrate_write_unlock(zspage);
+ zspage_write_unlock(zspage);
zpdesc_get(newzpdesc);
if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
@@ -1956,7 +1924,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
* protect the race between zpage migration and zs_free
* as well as zpage allocation/free
*/
- write_lock(&pool->migrate_lock);
+ write_lock(&pool->lock);
spin_lock(&class->lock);
while (zs_can_compact(class)) {
int fg;
@@ -1971,9 +1939,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
if (!src_zspage)
break;
- migrate_write_lock(src_zspage);
+ if (!zspage_write_trylock(src_zspage))
+ break;
+
migrate_zspage(pool, src_zspage, dst_zspage);
- migrate_write_unlock(src_zspage);
+ zspage_write_unlock(src_zspage);
fg = putback_zspage(class, src_zspage);
if (fg == ZS_INUSE_RATIO_0) {
@@ -1983,14 +1953,14 @@ static unsigned long __zs_compact(struct zs_pool *pool,
src_zspage = NULL;
if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100
- || rwlock_is_contended(&pool->migrate_lock)) {
+ || rwlock_is_contended(&pool->lock)) {
putback_zspage(class, dst_zspage);
dst_zspage = NULL;
spin_unlock(&class->lock);
- write_unlock(&pool->migrate_lock);
+ write_unlock(&pool->lock);
cond_resched();
- write_lock(&pool->migrate_lock);
+ write_lock(&pool->lock);
spin_lock(&class->lock);
}
}
@@ -2002,7 +1972,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
putback_zspage(class, dst_zspage);
spin_unlock(&class->lock);
- write_unlock(&pool->migrate_lock);
+ write_unlock(&pool->lock);
return pages_freed;
}
@@ -2014,10 +1984,10 @@ unsigned long zs_compact(struct zs_pool *pool)
unsigned long pages_freed = 0;
/*
- * Pool compaction is performed under pool->migrate_lock so it is basically
+ * Pool compaction is performed under pool->lock so it is basically
* single-threaded. Having more than one thread in __zs_compact()
- * will increase pool->migrate_lock contention, which will impact other
- * zsmalloc operations that need pool->migrate_lock.
+ * will increase pool->lock contention, which will impact other
+ * zsmalloc operations that need pool->lock.
*/
if (atomic_xchg(&pool->compaction_in_progress, 1))
return 0;
@@ -2139,7 +2109,7 @@ struct zs_pool *zs_create_pool(const char *name)
return NULL;
init_deferred_free(pool);
- rwlock_init(&pool->migrate_lock);
+ rwlock_init(&pool->lock);
atomic_set(&pool->compaction_in_progress, 0);
pool->name = kstrdup(name, GFP_KERNEL);
@@ -2278,23 +2248,11 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
static int __init zs_init(void)
{
- int ret;
-
- ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare",
- zs_cpu_prepare, zs_cpu_dead);
- if (ret)
- goto out;
-
#ifdef CONFIG_ZPOOL
zpool_register_driver(&zs_zpool_driver);
#endif
-
zs_stat_init();
-
return 0;
-
-out:
- return ret;
}
static void __exit zs_exit(void)
@@ -2302,8 +2260,6 @@ static void __exit zs_exit(void)
#ifdef CONFIG_ZPOOL
zpool_unregister_driver(&zs_zpool_driver);
#endif
- cpuhp_remove_state(CPUHP_MM_ZS_PREPARE);
-
zs_stat_exit();
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 6504174fbc6a..204fb59da33c 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -43,7 +43,7 @@
* statistics
**********************************/
/* The number of compressed pages currently stored in zswap */
-atomic_long_t zswap_stored_pages = ATOMIC_INIT(0);
+atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail;
static u64 zswap_reject_compress_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
+/* Load or writeback failed due to decompression failure */
+static u64 zswap_decompress_fail;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
@@ -881,18 +883,32 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ struct acomp_req *req;
+ struct crypto_acomp *acomp;
+ u8 *buffer;
+
+ if (IS_ERR_OR_NULL(acomp_ctx))
+ return 0;
mutex_lock(&acomp_ctx->mutex);
- if (!IS_ERR_OR_NULL(acomp_ctx)) {
- if (!IS_ERR_OR_NULL(acomp_ctx->req))
- acomp_request_free(acomp_ctx->req);
- acomp_ctx->req = NULL;
- if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
- crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
- }
+ req = acomp_ctx->req;
+ acomp = acomp_ctx->acomp;
+ buffer = acomp_ctx->buffer;
+ acomp_ctx->req = NULL;
+ acomp_ctx->acomp = NULL;
+ acomp_ctx->buffer = NULL;
mutex_unlock(&acomp_ctx->mutex);
+ /*
+ * Do the actual freeing after releasing the mutex to avoid subtle
+ * locking dependencies causing deadlocks.
+ */
+ if (!IS_ERR_OR_NULL(req))
+ acomp_request_free(req);
+ if (!IS_ERR_OR_NULL(acomp))
+ crypto_free_acomp(acomp);
+ kfree(buffer);
+
return 0;
}
@@ -930,7 +946,6 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
unsigned int dlen = PAGE_SIZE;
unsigned long handle;
struct zpool *zpool;
- char *buf;
gfp_t gfp;
u8 *dst;
@@ -965,17 +980,12 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
goto unlock;
zpool = pool->zpool;
- gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- if (zpool_malloc_support_movable(zpool))
- gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+ gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
if (alloc_ret)
goto unlock;
- buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
- memcpy(buf, dst, dlen);
- zpool_unmap_handle(zpool, handle);
-
+ zpool_obj_write(zpool, handle, dst, dlen);
entry->handle = handle;
entry->length = dlen;
@@ -991,41 +1001,49 @@ unlock:
return comp_ret == 0 && alloc_ret == 0;
}
-static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
+static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
{
struct zpool *zpool = entry->pool->zpool;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- u8 *src;
+ int decomp_ret, dlen;
+ u8 *src, *obj;
acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
- src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
+ obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer);
+
/*
- * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer
- * to do crypto_acomp_decompress() which might sleep. In such cases, we must
- * resort to copying the buffer to a temporary one.
- * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer,
- * such as a kmap address of high memory or even ever a vmap address.
- * However, sg_init_one is only equipped to handle linearly mapped low memory.
- * In such cases, we also must copy the buffer to a temporary and lowmem one.
+ * zpool_obj_read_begin() might return a kmap address of highmem when
+ * acomp_ctx->buffer is not used. However, sg_init_one() does not
+ * handle highmem addresses, so copy the object to acomp_ctx->buffer.
*/
- if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) ||
- !virt_addr_valid(src)) {
- memcpy(acomp_ctx->buffer, src, entry->length);
+ if (virt_addr_valid(obj)) {
+ src = obj;
+ } else {
+ WARN_ON_ONCE(obj == acomp_ctx->buffer);
+ memcpy(acomp_ctx->buffer, obj, entry->length);
src = acomp_ctx->buffer;
- zpool_unmap_handle(zpool, entry->handle);
}
sg_init_one(&input, src, entry->length);
sg_init_table(&output, 1);
sg_set_folio(&output, folio, PAGE_SIZE, 0);
acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
- BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
- BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
+ decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
- if (src != acomp_ctx->buffer)
- zpool_unmap_handle(zpool, entry->handle);
+ zpool_obj_read_end(zpool, entry->handle, obj);
acomp_ctx_put_unlock(acomp_ctx);
+
+ if (!decomp_ret && dlen == PAGE_SIZE)
+ return true;
+
+ zswap_decompress_fail++;
+ pr_alert_ratelimited("Decompression error from zswap (%d:%lu %s %u->%d)\n",
+ swp_type(entry->swpentry),
+ swp_offset(entry->swpentry),
+ entry->pool->tfm_name, entry->length, dlen);
+ return false;
}
/*********************************
@@ -1051,14 +1069,21 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
struct folio *folio;
struct mempolicy *mpol;
bool folio_was_allocated;
+ struct swap_info_struct *si;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
+ int ret = 0;
/* try to allocate swap cache folio */
+ si = get_swap_device(swpentry);
+ if (!si)
+ return -EEXIST;
+
mpol = get_task_policy(current);
folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
- NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+ NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+ put_swap_device(si);
if (!folio)
return -ENOMEM;
@@ -1070,8 +1095,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
* and freed when invalidated by the concurrent shrinker anyway.
*/
if (!folio_was_allocated) {
- folio_put(folio);
- return -EEXIST;
+ ret = -EEXIST;
+ goto out;
}
/*
@@ -1084,14 +1109,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
* be dereferenced.
*/
tree = swap_zswap_tree(swpentry);
- if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) {
- delete_from_swap_cache(folio);
- folio_unlock(folio);
- folio_put(folio);
- return -ENOMEM;
+ if (entry != xa_load(tree, offset)) {
+ ret = -ENOMEM;
+ goto out;
}
- zswap_decompress(entry, folio);
+ if (!zswap_decompress(entry, folio)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ xa_erase(tree, offset);
count_vm_event(ZSWPWB);
if (entry->objcg)
@@ -1107,9 +1135,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
/* start writeback */
__swap_writepage(folio, &wbc);
- folio_put(folio);
- return 0;
+out:
+ if (ret && ret != -EEXIST) {
+ delete_from_swap_cache(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
+ return ret;
}
/*********************************
@@ -1445,9 +1478,9 @@ resched:
* main API
**********************************/
-static ssize_t zswap_store_page(struct page *page,
- struct obj_cgroup *objcg,
- struct zswap_pool *pool)
+static bool zswap_store_page(struct page *page,
+ struct obj_cgroup *objcg,
+ struct zswap_pool *pool)
{
swp_entry_t page_swpentry = page_swap_entry(page);
struct zswap_entry *entry, *old;
@@ -1456,7 +1489,7 @@ static ssize_t zswap_store_page(struct page *page,
entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
if (!entry) {
zswap_reject_kmemcache_fail++;
- return -EINVAL;
+ return false;
}
if (!zswap_compress(page, entry, pool))
@@ -1483,13 +1516,17 @@ static ssize_t zswap_store_page(struct page *page,
/*
* The entry is successfully compressed and stored in the tree, there is
- * no further possibility of failure. Grab refs to the pool and objcg.
- * These refs will be dropped by zswap_entry_free() when the entry is
- * removed from the tree.
+ * no further possibility of failure. Grab refs to the pool and objcg,
+ * charge zswap memory, and increment zswap_stored_pages.
+ * The opposite actions will be performed by zswap_entry_free()
+ * when the entry is removed from the tree.
*/
zswap_pool_get(pool);
- if (objcg)
+ if (objcg) {
obj_cgroup_get(objcg);
+ obj_cgroup_charge_zswap(objcg, entry->length);
+ }
+ atomic_long_inc(&zswap_stored_pages);
/*
* We finish initializing the entry while it's already in xarray.
@@ -1510,13 +1547,13 @@ static ssize_t zswap_store_page(struct page *page,
zswap_lru_add(&zswap_list_lru, entry);
}
- return entry->length;
+ return true;
store_failed:
zpool_free(pool->zpool, entry->handle);
compress_failed:
zswap_entry_cache_free(entry);
- return -EINVAL;
+ return false;
}
bool zswap_store(struct folio *folio)
@@ -1526,7 +1563,6 @@ bool zswap_store(struct folio *folio)
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
struct zswap_pool *pool;
- size_t compressed_bytes = 0;
bool ret = false;
long index;
@@ -1564,20 +1600,14 @@ bool zswap_store(struct folio *folio)
for (index = 0; index < nr_pages; ++index) {
struct page *page = folio_page(folio, index);
- ssize_t bytes;
- bytes = zswap_store_page(page, objcg, pool);
- if (bytes < 0)
+ if (!zswap_store_page(page, objcg, pool))
goto put_pool;
- compressed_bytes += bytes;
}
- if (objcg) {
- obj_cgroup_charge_zswap(objcg, compressed_bytes);
+ if (objcg)
count_objcg_events(objcg, ZSWPOUT, nr_pages);
- }
- atomic_long_add(nr_pages, &zswap_stored_pages);
count_vm_events(ZSWPOUT, nr_pages);
ret = true;
@@ -1612,7 +1642,27 @@ check_old:
return ret;
}
-bool zswap_load(struct folio *folio)
+/**
+ * zswap_load() - load a folio from zswap
+ * @folio: folio to load
+ *
+ * Return: 0 on success, with the folio unlocked and marked up-to-date, or one
+ * of the following error codes:
+ *
+ * -EIO: if the swapped out content was in zswap, but could not be loaded
+ * into the page due to a decompression failure. The folio is unlocked, but
+ * NOT marked up-to-date, so that an IO error is emitted (e.g. do_swap_page()
+ * will SIGBUS).
+ *
+ * -EINVAL: if the swapped out content was in zswap, but the page belongs
+ * to a large folio, which is not supported by zswap. The folio is unlocked,
+ * but NOT marked up-to-date, so that an IO error is emitted (e.g.
+ * do_swap_page() will SIGBUS).
+ *
+ * -ENOENT: if the swapped out content was not in zswap. The folio remains
+ * locked on return.
+ */
+int zswap_load(struct folio *folio)
{
swp_entry_t swp = folio->swap;
pgoff_t offset = swp_offset(swp);
@@ -1623,18 +1673,32 @@ bool zswap_load(struct folio *folio)
VM_WARN_ON_ONCE(!folio_test_locked(folio));
if (zswap_never_enabled())
- return false;
+ return -ENOENT;
/*
* Large folios should not be swapped in while zswap is being used, as
* they are not properly handled. Zswap does not properly load large
* folios, and a large folio may only be partially in zswap.
- *
- * Return true without marking the folio uptodate so that an IO error is
- * emitted (e.g. do_swap_page() will sigbus).
*/
- if (WARN_ON_ONCE(folio_test_large(folio)))
- return true;
+ if (WARN_ON_ONCE(folio_test_large(folio))) {
+ folio_unlock(folio);
+ return -EINVAL;
+ }
+
+ entry = xa_load(tree, offset);
+ if (!entry)
+ return -ENOENT;
+
+ if (!zswap_decompress(entry, folio)) {
+ folio_unlock(folio);
+ return -EIO;
+ }
+
+ folio_mark_uptodate(folio);
+
+ count_vm_event(ZSWPIN);
+ if (entry->objcg)
+ count_objcg_events(entry->objcg, ZSWPIN, 1);
/*
* When reading into the swapcache, invalidate our entry. The
@@ -1648,27 +1712,14 @@ bool zswap_load(struct folio *folio)
* files, which reads into a private page and may free it if
* the fault fails. We remain the primary owner of the entry.)
*/
- if (swapcache)
- entry = xa_erase(tree, offset);
- else
- entry = xa_load(tree, offset);
-
- if (!entry)
- return false;
-
- zswap_decompress(entry, folio);
-
- count_vm_event(ZSWPIN);
- if (entry->objcg)
- count_objcg_events(entry->objcg, ZSWPIN, 1);
-
if (swapcache) {
- zswap_entry_free(entry);
folio_mark_dirty(folio);
+ xa_erase(tree, offset);
+ zswap_entry_free(entry);
}
- folio_mark_uptodate(folio);
- return true;
+ folio_unlock(folio);
+ return 0;
}
void zswap_invalidate(swp_entry_t swp)
@@ -1763,6 +1814,8 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_reject_compress_fail);
debugfs_create_u64("reject_compress_poor", 0444,
zswap_debugfs_root, &zswap_reject_compress_poor);
+ debugfs_create_u64("decompress_fail", 0444,
+ zswap_debugfs_root, &zswap_decompress_fail);
debugfs_create_u64("written_back_pages", 0444,
zswap_debugfs_root, &zswap_written_back_pages);
debugfs_create_file("pool_total_size", 0444,