summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig48
-rw-r--r--mm/Makefile4
-rw-r--r--mm/backing-dev.c9
-rw-r--r--mm/balloon.c (renamed from mm/balloon_compaction.c)199
-rw-r--r--mm/cma.c125
-rw-r--r--mm/damon/core.c242
-rw-r--r--mm/damon/lru_sort.c120
-rw-r--r--mm/damon/paddr.c27
-rw-r--r--mm/damon/reclaim.c10
-rw-r--r--mm/damon/stat.c14
-rw-r--r--mm/damon/sysfs-schemes.c58
-rw-r--r--mm/damon/sysfs.c18
-rw-r--r--mm/damon/tests/core-kunit.h51
-rw-r--r--mm/damon/tests/vaddr-kunit.h2
-rw-r--r--mm/damon/vaddr.c25
-rw-r--r--mm/debug_vm_pgtable.c38
-rw-r--r--mm/dmapool_test.c1
-rw-r--r--mm/early_ioremap.c16
-rw-r--r--mm/folio-compat.c1
-rw-r--r--mm/gup.c11
-rw-r--r--mm/gup_test.c1
-rw-r--r--mm/highmem.c3
-rw-r--r--mm/huge_memory.c8
-rw-r--r--mm/hugetlb.c180
-rw-r--r--mm/hugetlb_cgroup.c11
-rw-r--r--mm/hugetlb_cma.c71
-rw-r--r--mm/hugetlb_cma.h15
-rw-r--r--mm/hugetlb_sysctl.c11
-rw-r--r--mm/hugetlb_vmemmap.c11
-rw-r--r--mm/internal.h106
-rw-r--r--mm/kasan/kasan_test_c.c50
-rw-r--r--mm/kasan/report.c8
-rw-r--r--mm/kasan/shadow.c8
-rw-r--r--mm/kfence/core.c6
-rw-r--r--mm/khugepaged.c185
-rw-r--r--mm/kmsan/kmsan_test.c64
-rw-r--r--mm/list_lru.c6
-rw-r--r--mm/madvise.c26
-rw-r--r--mm/memblock.c4
-rw-r--r--mm/memcontrol-v1.c28
-rw-r--r--mm/memcontrol-v1.h6
-rw-r--r--mm/memcontrol.c99
-rw-r--r--mm/memfd.c3
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory-tiers.c7
-rw-r--r--mm/memory.c451
-rw-r--r--mm/memory_hotplug.c8
-rw-r--r--mm/mempolicy.c5
-rw-r--r--mm/migrate.c14
-rw-r--r--mm/migrate_device.c12
-rw-r--r--mm/mm_init.c35
-rw-r--r--mm/mmap_lock.c177
-rw-r--r--mm/mmu_gather.c6
-rw-r--r--mm/mprotect.c4
-rw-r--r--mm/mremap.c10
-rw-r--r--mm/mseal.c4
-rw-r--r--mm/numa_memblks.c2
-rw-r--r--mm/oom_kill.c26
-rw-r--r--mm/page-writeback.c74
-rw-r--r--mm/page_alloc.c425
-rw-r--r--mm/page_io.c4
-rw-r--r--mm/page_isolation.c189
-rw-r--r--mm/page_reporting.c2
-rw-r--r--mm/page_table_check.c41
-rw-r--r--mm/pagewalk.c3
-rw-r--r--mm/percpu.c15
-rw-r--r--mm/pt_reclaim.c72
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c324
-rw-r--r--mm/shmem.c90
-rw-r--r--mm/show_mem.c3
-rw-r--r--mm/shrinker_debug.c13
-rw-r--r--mm/slub.c3
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swap.h72
-rw-r--r--mm/swap_state.c342
-rw-r--r--mm/swapfile.c862
-rw-r--r--mm/tests/lazy_mmu_mode_kunit.c73
-rw-r--r--mm/userfaultfd.c18
-rw-r--r--mm/vma.c16
-rw-r--r--mm/vma.h14
-rw-r--r--mm/vmalloc.c26
-rw-r--r--mm/vmscan.c221
-rw-r--r--mm/vmstat.c30
-rw-r--r--mm/workingset.c8
-rw-r--r--mm/zsmalloc.c192
-rw-r--r--mm/zswap.c58
87 files changed, 3376 insertions, 2510 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index fbac1dfc9943..ebd8ea353687 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -582,23 +582,20 @@ config SPLIT_PMD_PTLOCKS
#
# support for memory balloon
-config MEMORY_BALLOON
+config BALLOON
bool
#
-# support for memory balloon compaction
-config BALLOON_COMPACTION
- bool "Allow for balloon memory compaction/migration"
+# support for memory balloon page migration
+config BALLOON_MIGRATION
+ bool "Allow for balloon memory migration"
default y
- depends on COMPACTION && MEMORY_BALLOON
+ depends on MIGRATION && BALLOON
help
- Memory fragmentation introduced by ballooning might reduce
- significantly the number of 2MB contiguous memory blocks that can be
- used within a guest, thus imposing performance penalties associated
- with the reduced number of transparent huge pages that could be used
- by the guest workload. Allowing the compaction & migration for memory
- pages enlisted as being part of memory balloon devices avoids the
- scenario aforementioned and helps improving memory defragmentation.
+ Allow for migration of pages inflated in a memory balloon such that
+ they can be allocated from memory areas only available for movable
+ allocations (e.g., ZONE_MOVABLE, CMA) and such that they can be
+ migrated for memory defragmentation purposes by memory compaction.
#
# support for memory compaction
@@ -1440,14 +1437,12 @@ config ARCH_HAS_USER_SHADOW_STACK
The architecture has hardware support for userspace shadow call
stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss).
-config ARCH_SUPPORTS_PT_RECLAIM
+config HAVE_ARCH_TLB_REMOVE_TABLE
def_bool n
config PT_RECLAIM
- bool "reclaim empty user page table pages"
- default y
- depends on ARCH_SUPPORTS_PT_RECLAIM && MMU && SMP
- select MMU_GATHER_RCU_TABLE_FREE
+ def_bool y
+ depends on MMU_GATHER_RCU_TABLE_FREE && !HAVE_ARCH_TLB_REMOVE_TABLE
help
Try to reclaim empty user page table pages in paths other than munmap
and exit_mmap path.
@@ -1457,6 +1452,25 @@ config PT_RECLAIM
config FIND_NORMAL_PAGE
def_bool n
+config ARCH_HAS_LAZY_MMU_MODE
+ bool
+ help
+ The architecture uses the lazy MMU mode. This allows changes to
+ MMU-related architectural state to be deferred until the mode is
+ exited. See <linux/pgtable.h> for details.
+
+config LAZY_MMU_MODE_KUNIT_TEST
+ tristate "KUnit tests for the lazy MMU mode" if !KUNIT_ALL_TESTS
+ depends on ARCH_HAS_LAZY_MMU_MODE
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ Enable this option to check that the lazy MMU mode interface behaves
+ as expected. Only tests for the generic interface are included (not
+ architecture-specific behaviours).
+
+ If unsure, say N.
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index bf46fe31dc14..fd30164933a5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -125,7 +125,7 @@ obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_NUMA_MEMBLKS) += numa_memblks.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
-obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_BALLOON) += balloon.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
@@ -149,4 +149,4 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_EXECMEM) += execmem.o
obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
-obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
+obj-$(CONFIG_LAZY_MMU_MODE_KUNIT_TEST) += tests/lazy_mmu_mode_kunit.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c5740c6d37a2..e319bd5e8b75 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -939,7 +939,7 @@ void wb_memcg_offline(struct mem_cgroup *memcg)
memcg_cgwb_list->next = NULL; /* prevent new wb's */
spin_unlock_irq(&cgwb_lock);
- queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
+ queue_work(system_dfl_wq, &cleanup_offline_cgwbs_work);
}
/**
@@ -971,10 +971,10 @@ static int __init cgwb_init(void)
{
/*
* There can be many concurrent release work items overwhelming
- * system_wq. Put them in a separate wq and limit concurrency.
+ * system_percpu_wq. Put them in a separate wq and limit concurrency.
* There's no point in executing many of these in parallel.
*/
- cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
+ cgwb_release_wq = alloc_workqueue("cgwb_release", WQ_PERCPU, 1);
if (!cgwb_release_wq)
return -ENOMEM;
@@ -1034,7 +1034,6 @@ struct backing_dev_info *bdi_alloc(int node_id)
bdi->capabilities = BDI_CAP_WRITEBACK;
bdi->ra_pages = VM_READAHEAD_PAGES;
bdi->io_pages = VM_READAHEAD_PAGES;
- timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
return bdi;
}
EXPORT_SYMBOL(bdi_alloc);
@@ -1156,8 +1155,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
- timer_delete_sync(&bdi->laptop_mode_wb_timer);
-
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
diff --git a/mm/balloon_compaction.c b/mm/balloon.c
index 03c5dbabb156..96a8f1e20bc6 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon.c
@@ -1,28 +1,62 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * mm/balloon_compaction.c
- *
- * Common interface for making balloon pages movable by compaction.
+ * Common interface for implementing a memory balloon, including support
+ * for migration of pages inflated in a memory balloon.
*
* Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/export.h>
-#include <linux/balloon_compaction.h>
+#include <linux/balloon.h>
+
+/*
+ * Lock protecting the balloon_dev_info of all devices. We don't really
+ * expect more than one device.
+ */
+static DEFINE_SPINLOCK(balloon_pages_lock);
+
+/**
+ * balloon_page_insert - insert a page into the balloon's page list and make
+ * the page->private assignment accordingly.
+ * @balloon : pointer to balloon device
+ * @page : page to be assigned as a 'balloon page'
+ *
+ * Caller must ensure the balloon_pages_lock is held.
+ */
+static void balloon_page_insert(struct balloon_dev_info *balloon,
+ struct page *page)
+{
+ lockdep_assert_held(&balloon_pages_lock);
+ __SetPageOffline(page);
+ if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) {
+ SetPageMovableOps(page);
+ set_page_private(page, (unsigned long)balloon);
+ }
+ list_add(&page->lru, &balloon->pages);
+}
+
+/**
+ * balloon_page_finalize - prepare a balloon page that was removed from the
+ * balloon list for release to the page allocator
+ * @page: page to be released to the page allocator
+ *
+ * Caller must ensure the balloon_pages_lock is held.
+ */
+static void balloon_page_finalize(struct page *page)
+{
+ lockdep_assert_held(&balloon_pages_lock);
+ if (IS_ENABLED(CONFIG_BALLOON_MIGRATION))
+ set_page_private(page, 0);
+ /* PageOffline is sticky until the page is freed to the buddy. */
+}
static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info,
struct page *page)
{
- /*
- * Block others from accessing the 'page' when we get around to
- * establishing additional references. We should be the only one
- * holding a reference to the 'page' at this point. If we are not, then
- * memory corruption is possible and we should stop execution.
- */
- BUG_ON(!trylock_page(page));
balloon_page_insert(b_dev_info, page);
- unlock_page(page);
+ if (b_dev_info->adjust_managed_page_count)
+ adjust_managed_page_count(page, -1);
__count_vm_event(BALLOON_INFLATE);
inc_node_page_state(page, NR_BALLOON_PAGES);
}
@@ -45,13 +79,13 @@ size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info,
unsigned long flags;
size_t n_pages = 0;
- spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ spin_lock_irqsave(&balloon_pages_lock, flags);
list_for_each_entry_safe(page, tmp, pages, lru) {
list_del(&page->lru);
balloon_page_enqueue_one(b_dev_info, page);
n_pages++;
}
- spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ spin_unlock_irqrestore(&balloon_pages_lock, flags);
return n_pages;
}
EXPORT_SYMBOL_GPL(balloon_page_list_enqueue);
@@ -81,34 +115,26 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
unsigned long flags;
size_t n_pages = 0;
- spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ spin_lock_irqsave(&balloon_pages_lock, flags);
list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
if (n_pages == n_req_pages)
break;
-
- /*
- * Block others from accessing the 'page' while we get around to
- * establishing additional references and preparing the 'page'
- * to be released by the balloon driver.
- */
- if (!trylock_page(page))
- continue;
-
list_del(&page->lru);
+ if (b_dev_info->adjust_managed_page_count)
+ adjust_managed_page_count(page, 1);
balloon_page_finalize(page);
__count_vm_event(BALLOON_DEFLATE);
list_add(&page->lru, pages);
- unlock_page(page);
dec_node_page_state(page, NR_BALLOON_PAGES);
n_pages++;
}
- spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ spin_unlock_irqrestore(&balloon_pages_lock, flags);
return n_pages;
}
EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
-/*
+/**
* balloon_page_alloc - allocates a new page for insertion into the balloon
* page list.
*
@@ -120,14 +146,18 @@ EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
*/
struct page *balloon_page_alloc(void)
{
- struct page *page = alloc_page(balloon_mapping_gfp_mask() |
- __GFP_NOMEMALLOC | __GFP_NORETRY |
- __GFP_NOWARN);
- return page;
+ gfp_t gfp_flags = __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+
+ if (IS_ENABLED(CONFIG_BALLOON_MIGRATION))
+ gfp_flags |= GFP_HIGHUSER_MOVABLE;
+ else
+ gfp_flags |= GFP_HIGHUSER;
+
+ return alloc_page(gfp_flags);
}
EXPORT_SYMBOL_GPL(balloon_page_alloc);
-/*
+/**
* balloon_page_enqueue - inserts a new page into the balloon page list.
*
* @b_dev_info: balloon device descriptor where we will insert a new page
@@ -136,22 +166,21 @@ EXPORT_SYMBOL_GPL(balloon_page_alloc);
* Drivers must call this function to properly enqueue a new allocated balloon
* page before definitively removing the page from the guest system.
*
- * Drivers must not call balloon_page_enqueue on pages that have been pushed to
- * a list with balloon_page_push before removing them with balloon_page_pop. To
- * enqueue a list of pages, use balloon_page_list_enqueue instead.
+ * Drivers must not enqueue pages while page->lru is still in
+ * use, and must not use page->lru until a page was unqueued again.
*/
void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
struct page *page)
{
unsigned long flags;
- spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ spin_lock_irqsave(&balloon_pages_lock, flags);
balloon_page_enqueue_one(b_dev_info, page);
- spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ spin_unlock_irqrestore(&balloon_pages_lock, flags);
}
EXPORT_SYMBOL_GPL(balloon_page_enqueue);
-/*
+/**
* balloon_page_dequeue - removes a page from balloon's page list and returns
* its address to allow the driver to release the page.
* @b_dev_info: balloon device descriptor where we will grab a page from.
@@ -187,32 +216,42 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
* BUG() here, otherwise the balloon driver may get stuck in
* an infinite loop while attempting to release all its pages.
*/
- spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ spin_lock_irqsave(&balloon_pages_lock, flags);
if (unlikely(list_empty(&b_dev_info->pages) &&
!b_dev_info->isolated_pages))
BUG();
- spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ spin_unlock_irqrestore(&balloon_pages_lock, flags);
return NULL;
}
return list_first_entry(&pages, struct page, lru);
}
EXPORT_SYMBOL_GPL(balloon_page_dequeue);
-#ifdef CONFIG_BALLOON_COMPACTION
+#ifdef CONFIG_BALLOON_MIGRATION
+static struct balloon_dev_info *balloon_page_device(struct page *page)
+{
+ return (struct balloon_dev_info *)page_private(page);
+}
static bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
{
- struct balloon_dev_info *b_dev_info = balloon_page_device(page);
+ struct balloon_dev_info *b_dev_info;
unsigned long flags;
- if (!b_dev_info)
+ spin_lock_irqsave(&balloon_pages_lock, flags);
+ b_dev_info = balloon_page_device(page);
+ if (!b_dev_info) {
+ /*
+ * The page already got deflated and removed from the
+ * balloon list.
+ */
+ spin_unlock_irqrestore(&balloon_pages_lock, flags);
return false;
-
- spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ }
list_del(&page->lru);
b_dev_info->isolated_pages++;
- spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ spin_unlock_irqrestore(&balloon_pages_lock, flags);
return true;
}
@@ -222,33 +261,75 @@ static void balloon_page_putback(struct page *page)
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
- /* Isolated balloon pages cannot get deflated. */
+ /*
+ * When we isolated the page, the page was still inflated in a balloon
+ * device. As isolated balloon pages cannot get deflated, we still have
+ * a balloon device here.
+ */
if (WARN_ON_ONCE(!b_dev_info))
return;
- spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ spin_lock_irqsave(&balloon_pages_lock, flags);
list_add(&page->lru, &b_dev_info->pages);
b_dev_info->isolated_pages--;
- spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ spin_unlock_irqrestore(&balloon_pages_lock, flags);
}
-/* move_to_new_page() counterpart for a ballooned page */
static int balloon_page_migrate(struct page *newpage, struct page *page,
enum migrate_mode mode)
{
- struct balloon_dev_info *balloon = balloon_page_device(page);
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ struct balloon_dev_info *b_dev_info = balloon_page_device(page);
+ unsigned long flags;
+ int rc;
- /* Isolated balloon pages cannot get deflated. */
- if (WARN_ON_ONCE(!balloon))
+ /*
+ * When we isolated the page, the page was still inflated in a balloon
+ * device. As isolated balloon pages cannot get deflated, we still have
+ * a balloon device here.
+ */
+ if (WARN_ON_ONCE(!b_dev_info))
return -EAGAIN;
- return balloon->migratepage(balloon, newpage, page, mode);
+ rc = b_dev_info->migratepage(b_dev_info, newpage, page, mode);
+ if (rc < 0 && rc != -ENOENT)
+ return rc;
+
+ spin_lock_irqsave(&balloon_pages_lock, flags);
+ if (!rc) {
+ /* Insert the new page into the balloon list. */
+ get_page(newpage);
+ balloon_page_insert(b_dev_info, newpage);
+ __count_vm_event(BALLOON_MIGRATE);
+
+ if (b_dev_info->adjust_managed_page_count &&
+ page_zone(page) != page_zone(newpage)) {
+ /*
+ * When we migrate a page to a different zone we
+ * have to fixup the count of both involved zones.
+ */
+ adjust_managed_page_count(page, 1);
+ adjust_managed_page_count(newpage, -1);
+ }
+ } else {
+ /* Old page was deflated but new page not inflated. */
+ __count_vm_event(BALLOON_DEFLATE);
+
+ if (b_dev_info->adjust_managed_page_count)
+ adjust_managed_page_count(page, 1);
+ }
+
+ b_dev_info->isolated_pages--;
+
+ /* Free the now-deflated page we isolated in balloon_page_isolate(). */
+ balloon_page_finalize(page);
+ spin_unlock_irqrestore(&balloon_pages_lock, flags);
+
+ put_page(page);
+
+ return 0;
}
-const struct movable_operations balloon_mops = {
+static const struct movable_operations balloon_mops = {
.migrate_page = balloon_page_migrate,
.isolate_page = balloon_page_isolate,
.putback_page = balloon_page_putback,
@@ -260,4 +341,4 @@ static int __init balloon_init(void)
}
core_initcall(balloon_init);
-#endif /* CONFIG_BALLOON_COMPACTION */
+#endif /* CONFIG_BALLOON_MIGRATION */
diff --git a/mm/cma.c b/mm/cma.c
index 813e6dc7b095..94b5da468a7d 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -22,6 +22,7 @@
#include <linux/mm.h>
#include <linux/sizes.h>
#include <linux/slab.h>
+#include <linux/string.h>
#include <linux/string_choices.h>
#include <linux/log2.h>
#include <linux/cma.h>
@@ -233,7 +234,7 @@ static int __init cma_new_area(const char *name, phys_addr_t size,
cma_area_count++;
if (name)
- snprintf(cma->name, CMA_MAX_NAME, "%s", name);
+ strscpy(cma->name, name);
else
snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count);
@@ -836,7 +837,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
spin_unlock_irq(&cma->lock);
mutex_lock(&cma->alloc_mutex);
- ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp);
+ ret = alloc_contig_frozen_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp);
mutex_unlock(&cma->alloc_mutex);
if (!ret)
break;
@@ -856,8 +857,8 @@ out:
return ret;
}
-static struct page *__cma_alloc(struct cma *cma, unsigned long count,
- unsigned int align, gfp_t gfp)
+static struct page *__cma_alloc_frozen(struct cma *cma,
+ unsigned long count, unsigned int align, gfp_t gfp)
{
struct page *page = NULL;
int ret = -ENOMEM, r;
@@ -914,6 +915,21 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
return page;
}
+struct page *cma_alloc_frozen(struct cma *cma, unsigned long count,
+ unsigned int align, bool no_warn)
+{
+ gfp_t gfp = GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0);
+
+ return __cma_alloc_frozen(cma, count, align, gfp);
+}
+
+struct page *cma_alloc_frozen_compound(struct cma *cma, unsigned int order)
+{
+ gfp_t gfp = GFP_KERNEL | __GFP_COMP | __GFP_NOWARN;
+
+ return __cma_alloc_frozen(cma, 1 << order, order, gfp);
+}
+
/**
* cma_alloc() - allocate pages from contiguous area
* @cma: Contiguous memory region for which the allocation is performed.
@@ -927,49 +943,60 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count,
struct page *cma_alloc(struct cma *cma, unsigned long count,
unsigned int align, bool no_warn)
{
- return __cma_alloc(cma, count, align, GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
-}
-
-struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp)
-{
struct page *page;
- if (WARN_ON(!order || !(gfp & __GFP_COMP)))
- return NULL;
-
- page = __cma_alloc(cma, 1 << order, order, gfp);
+ page = cma_alloc_frozen(cma, count, align, no_warn);
+ if (page)
+ set_pages_refcounted(page, count);
- return page ? page_folio(page) : NULL;
+ return page;
}
-bool cma_pages_valid(struct cma *cma, const struct page *pages,
- unsigned long count)
+static struct cma_memrange *find_cma_memrange(struct cma *cma,
+ const struct page *pages, unsigned long count)
{
- unsigned long pfn, end;
+ struct cma_memrange *cmr = NULL;
+ unsigned long pfn, end_pfn;
int r;
- struct cma_memrange *cmr;
- bool ret;
+
+ pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
if (!cma || !pages || count > cma->count)
- return false;
+ return NULL;
pfn = page_to_pfn(pages);
- ret = false;
for (r = 0; r < cma->nranges; r++) {
cmr = &cma->ranges[r];
- end = cmr->base_pfn + cmr->count;
- if (pfn >= cmr->base_pfn && pfn < end) {
- ret = pfn + count <= end;
- break;
+ end_pfn = cmr->base_pfn + cmr->count;
+ if (pfn >= cmr->base_pfn && pfn < end_pfn) {
+ if (pfn + count <= end_pfn)
+ break;
+
+ VM_WARN_ON_ONCE(1);
}
}
- if (!ret)
- pr_debug("%s(page %p, count %lu)\n",
- __func__, (void *)pages, count);
+ if (r == cma->nranges) {
+ pr_debug("%s(page %p, count %lu, no cma range matches the page range)\n",
+ __func__, (void *)pages, count);
+ return NULL;
+ }
- return ret;
+ return cmr;
+}
+
+static void __cma_release_frozen(struct cma *cma, struct cma_memrange *cmr,
+ const struct page *pages, unsigned long count)
+{
+ unsigned long pfn = page_to_pfn(pages);
+
+ pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
+
+ free_contig_frozen_range(pfn, count);
+ cma_clear_bitmap(cma, cmr, pfn, count);
+ cma_sysfs_account_release_pages(cma, count);
+ trace_cma_release(cma->name, pfn, pages, count);
}
/**
@@ -986,43 +1013,33 @@ bool cma_release(struct cma *cma, const struct page *pages,
unsigned long count)
{
struct cma_memrange *cmr;
- unsigned long pfn, end_pfn;
- int r;
+ unsigned long i, pfn;
- pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
-
- if (!cma_pages_valid(cma, pages, count))
+ cmr = find_cma_memrange(cma, pages, count);
+ if (!cmr)
return false;
pfn = page_to_pfn(pages);
- end_pfn = pfn + count;
+ for (i = 0; i < count; i++, pfn++)
+ VM_WARN_ON(!put_page_testzero(pfn_to_page(pfn)));
- for (r = 0; r < cma->nranges; r++) {
- cmr = &cma->ranges[r];
- if (pfn >= cmr->base_pfn &&
- pfn < (cmr->base_pfn + cmr->count)) {
- VM_BUG_ON(end_pfn > cmr->base_pfn + cmr->count);
- break;
- }
- }
-
- if (r == cma->nranges)
- return false;
-
- free_contig_range(pfn, count);
- cma_clear_bitmap(cma, cmr, pfn, count);
- cma_sysfs_account_release_pages(cma, count);
- trace_cma_release(cma->name, pfn, pages, count);
+ __cma_release_frozen(cma, cmr, pages, count);
return true;
}
-bool cma_free_folio(struct cma *cma, const struct folio *folio)
+bool cma_release_frozen(struct cma *cma, const struct page *pages,
+ unsigned long count)
{
- if (WARN_ON(!folio_test_large(folio)))
+ struct cma_memrange *cmr;
+
+ cmr = find_cma_memrange(cma, pages, count);
+ if (!cmr)
return false;
- return cma_release(cma, &folio->page, folio_nr_pages(folio));
+ __cma_release_frozen(cma, cmr, pages, count);
+
+ return true;
}
int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 84f80a20f233..5e2724a4f285 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -157,6 +157,12 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t)
damon_free_region(r);
}
+static bool damon_is_last_region(struct damon_region *r,
+ struct damon_target *t)
+{
+ return list_is_last(&r->list, &t->regions_list);
+}
+
/*
* Check whether a region is intersecting an address range
*
@@ -197,7 +203,7 @@ static int damon_fill_regions_holes(struct damon_region *first,
* @t: the given target.
* @ranges: array of new monitoring target ranges.
* @nr_ranges: length of @ranges.
- * @min_sz_region: minimum region size.
+ * @min_region_sz: minimum region size.
*
* This function adds new regions to, or modify existing regions of a
* monitoring target to fit in specific ranges.
@@ -205,7 +211,7 @@ static int damon_fill_regions_holes(struct damon_region *first,
* Return: 0 if success, or negative error code otherwise.
*/
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
- unsigned int nr_ranges, unsigned long min_sz_region)
+ unsigned int nr_ranges, unsigned long min_region_sz)
{
struct damon_region *r, *next;
unsigned int i;
@@ -242,16 +248,16 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
/* no region intersects with this range */
newr = damon_new_region(
ALIGN_DOWN(range->start,
- min_sz_region),
- ALIGN(range->end, min_sz_region));
+ min_region_sz),
+ ALIGN(range->end, min_region_sz));
if (!newr)
return -ENOMEM;
damon_insert_region(newr, damon_prev_region(r), r, t);
} else {
/* resize intersecting regions to fit in this range */
first->ar.start = ALIGN_DOWN(range->start,
- min_sz_region);
- last->ar.end = ALIGN(range->end, min_sz_region);
+ min_region_sz);
+ last->ar.end = ALIGN(range->end, min_region_sz);
/* fill possible holes in the range */
err = damon_fill_regions_holes(first, last, t);
@@ -278,7 +284,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
}
/**
- * damos_filter_for_ops() - Return if the filter is ops-hndled one.
+ * damos_filter_for_ops() - Return if the filter is ops-handled one.
* @type: type of the filter.
*
* Return: true if the filter of @type needs to be handled by ops layer, false
@@ -395,6 +401,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
INIT_LIST_HEAD(&scheme->core_filters);
INIT_LIST_HEAD(&scheme->ops_filters);
scheme->stat = (struct damos_stat){};
+ scheme->max_nr_snapshots = 0;
INIT_LIST_HEAD(&scheme->list);
scheme->quota = *(damos_quota_init(quota));
@@ -546,7 +553,7 @@ struct damon_ctx *damon_new_ctx(void)
ctx->attrs.max_nr_regions = 1000;
ctx->addr_unit = 1;
- ctx->min_sz_region = DAMON_MIN_REGION;
+ ctx->min_region_sz = DAMON_MIN_REGION_SZ;
INIT_LIST_HEAD(&ctx->adaptive_targets);
INIT_LIST_HEAD(&ctx->schemes);
@@ -1072,7 +1079,11 @@ static int damos_commit(struct damos *dst, struct damos *src)
return err;
err = damos_commit_filters(dst, src);
- return err;
+ if (err)
+ return err;
+
+ dst->max_nr_snapshots = src->max_nr_snapshots;
+ return 0;
}
static int damon_commit_schemes(struct damon_ctx *dst, struct damon_ctx *src)
@@ -1131,7 +1142,7 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx)
* If @src has no region, @dst keeps current regions.
*/
static int damon_commit_target_regions(struct damon_target *dst,
- struct damon_target *src, unsigned long src_min_sz_region)
+ struct damon_target *src, unsigned long src_min_region_sz)
{
struct damon_region *src_region;
struct damon_addr_range *ranges;
@@ -1148,7 +1159,7 @@ static int damon_commit_target_regions(struct damon_target *dst,
i = 0;
damon_for_each_region(src_region, src)
ranges[i++] = src_region->ar;
- err = damon_set_regions(dst, ranges, i, src_min_sz_region);
+ err = damon_set_regions(dst, ranges, i, src_min_region_sz);
kfree(ranges);
return err;
}
@@ -1156,11 +1167,11 @@ static int damon_commit_target_regions(struct damon_target *dst,
static int damon_commit_target(
struct damon_target *dst, bool dst_has_pid,
struct damon_target *src, bool src_has_pid,
- unsigned long src_min_sz_region)
+ unsigned long src_min_region_sz)
{
int err;
- err = damon_commit_target_regions(dst, src, src_min_sz_region);
+ err = damon_commit_target_regions(dst, src, src_min_region_sz);
if (err)
return err;
if (dst_has_pid)
@@ -1187,7 +1198,7 @@ static int damon_commit_targets(
err = damon_commit_target(
dst_target, damon_target_has_pid(dst),
src_target, damon_target_has_pid(src),
- src->min_sz_region);
+ src->min_region_sz);
if (err)
return err;
} else {
@@ -1214,7 +1225,7 @@ static int damon_commit_targets(
return -ENOMEM;
err = damon_commit_target(new_target, false,
src_target, damon_target_has_pid(src),
- src->min_sz_region);
+ src->min_region_sz);
if (err) {
damon_destroy_target(new_target, NULL);
return err;
@@ -1261,7 +1272,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
}
dst->ops = src->ops;
dst->addr_unit = src->addr_unit;
- dst->min_sz_region = src->min_sz_region;
+ dst->min_region_sz = src->min_region_sz;
return 0;
}
@@ -1294,8 +1305,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
if (ctx->attrs.min_nr_regions)
sz /= ctx->attrs.min_nr_regions;
- if (sz < ctx->min_sz_region)
- sz = ctx->min_sz_region;
+ if (sz < ctx->min_region_sz)
+ sz = ctx->min_region_sz;
return sz;
}
@@ -1431,6 +1442,23 @@ bool damon_is_running(struct damon_ctx *ctx)
return running;
}
+/**
+ * damon_kdamond_pid() - Return pid of a given DAMON context's worker thread.
+ * @ctx: The DAMON context of the question.
+ *
+ * Return: pid if @ctx is running, negative error code otherwise.
+ */
+int damon_kdamond_pid(struct damon_ctx *ctx)
+{
+ int pid = -EINVAL;
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond)
+ pid = ctx->kdamond->pid;
+ mutex_unlock(&ctx->kdamond_lock);
+ return pid;
+}
+
/*
* damon_call_handle_inactive_ctx() - handle DAMON call request that added to
* an inactive context.
@@ -1604,7 +1632,7 @@ static unsigned long damon_get_intervals_adaptation_bp(struct damon_ctx *c)
adaptation_bp = damon_feed_loop_next_input(100000000, score_bp) /
10000;
/*
- * adaptaion_bp ranges from 1 to 20,000. Avoid too rapid reduction of
+ * adaptation_bp ranges from 1 to 20,000. Avoid too rapid reduction of
* the intervals by rescaling [1,10,000] to [5000, 10,000].
*/
if (adaptation_bp <= 10000)
@@ -1668,7 +1696,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
* @t: The target of the region.
* @rp: The pointer to the region.
* @s: The scheme to be applied.
- * @min_sz_region: minimum region size.
+ * @min_region_sz: minimum region size.
*
* If a quota of a scheme has exceeded in a quota charge window, the scheme's
* action would applied to only a part of the target access pattern fulfilling
@@ -1686,7 +1714,8 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
* Return: true if the region should be entirely skipped, false otherwise.
*/
static bool damos_skip_charged_region(struct damon_target *t,
- struct damon_region **rp, struct damos *s, unsigned long min_sz_region)
+ struct damon_region **rp, struct damos *s,
+ unsigned long min_region_sz)
{
struct damon_region *r = *rp;
struct damos_quota *quota = &s->quota;
@@ -1708,11 +1737,11 @@ static bool damos_skip_charged_region(struct damon_target *t,
if (quota->charge_addr_from && r->ar.start <
quota->charge_addr_from) {
sz_to_skip = ALIGN_DOWN(quota->charge_addr_from -
- r->ar.start, min_sz_region);
+ r->ar.start, min_region_sz);
if (!sz_to_skip) {
- if (damon_sz_region(r) <= min_sz_region)
+ if (damon_sz_region(r) <= min_region_sz)
return true;
- sz_to_skip = min_sz_region;
+ sz_to_skip = min_region_sz;
}
damon_split_region_at(t, r, sz_to_skip);
r = damon_next_region(r);
@@ -1738,7 +1767,7 @@ static void damos_update_stat(struct damos *s,
static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
struct damon_region *r, struct damos_filter *filter,
- unsigned long min_sz_region)
+ unsigned long min_region_sz)
{
bool matched = false;
struct damon_target *ti;
@@ -1755,8 +1784,8 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
matched = target_idx == filter->target_idx;
break;
case DAMOS_FILTER_TYPE_ADDR:
- start = ALIGN_DOWN(filter->addr_range.start, min_sz_region);
- end = ALIGN_DOWN(filter->addr_range.end, min_sz_region);
+ start = ALIGN_DOWN(filter->addr_range.start, min_region_sz);
+ end = ALIGN_DOWN(filter->addr_range.end, min_region_sz);
/* inside the range */
if (start <= r->ar.start && r->ar.end <= end) {
@@ -1785,14 +1814,14 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
return matched == filter->matching;
}
-static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
+static bool damos_core_filter_out(struct damon_ctx *ctx, struct damon_target *t,
struct damon_region *r, struct damos *s)
{
struct damos_filter *filter;
s->core_filters_allowed = false;
damos_for_each_core_filter(filter, s) {
- if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) {
+ if (damos_filter_match(ctx, t, r, filter, ctx->min_region_sz)) {
if (filter->allow)
s->core_filters_allowed = true;
return !filter->allow;
@@ -1927,12 +1956,12 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
if (c->ops.apply_scheme) {
if (quota->esz && quota->charged_sz + sz > quota->esz) {
sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
- c->min_sz_region);
+ c->min_region_sz);
if (!sz)
goto update_stat;
damon_split_region_at(t, r, sz);
}
- if (damos_filter_out(c, t, r, s))
+ if (damos_core_filter_out(c, t, r, s))
return;
ktime_get_coarse_ts64(&begin);
trace_damos_before_apply(cidx, sidx, tidx, r,
@@ -1975,13 +2004,18 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
if (quota->esz && quota->charged_sz >= quota->esz)
continue;
- if (damos_skip_charged_region(t, &r, s, c->min_sz_region))
+ if (damos_skip_charged_region(t, &r, s, c->min_region_sz))
continue;
- if (!damos_valid_target(c, t, r, s))
+ if (s->max_nr_snapshots &&
+ s->max_nr_snapshots <= s->stat.nr_snapshots)
continue;
- damos_apply_scheme(c, t, r, s);
+ if (damos_valid_target(c, t, r, s))
+ damos_apply_scheme(c, t, r, s);
+
+ if (damon_is_last_region(r, t))
+ s->stat.nr_snapshots++;
}
}
@@ -2078,16 +2112,13 @@ static unsigned long damos_get_node_memcg_used_bp(
unsigned long used_pages, numerator;
struct sysinfo i;
- rcu_read_lock();
- memcg = mem_cgroup_from_id(goal->memcg_id);
- if (!memcg || !mem_cgroup_tryget(memcg)) {
- rcu_read_unlock();
+ memcg = mem_cgroup_get_from_id(goal->memcg_id);
+ if (!memcg) {
if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP)
return 0;
else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */
return 10000;
}
- rcu_read_unlock();
mem_cgroup_flush_stats(memcg);
lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(goal->nid));
@@ -2119,6 +2150,23 @@ static unsigned long damos_get_node_memcg_used_bp(
}
#endif
+/*
+ * Returns LRU-active or inactive memory to total LRU memory size ratio.
+ */
+static unsigned int damos_get_in_active_mem_bp(bool active_ratio)
+{
+ unsigned long active, inactive, total;
+
+ /* This should align with /proc/meminfo output */
+ active = global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_ANON) +
+ global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE);
+ inactive = global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_ANON) +
+ global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE);
+ total = active + inactive;
+ if (active_ratio)
+ return active * 10000 / total;
+ return inactive * 10000 / total;
+}
static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
{
@@ -2141,6 +2189,11 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
case DAMOS_QUOTA_NODE_MEMCG_FREE_BP:
goal->current_value = damos_get_node_memcg_used_bp(goal);
break;
+ case DAMOS_QUOTA_ACTIVE_MEM_BP:
+ case DAMOS_QUOTA_INACTIVE_MEM_BP:
+ goal->current_value = damos_get_in_active_mem_bp(
+ goal->metric == DAMOS_QUOTA_ACTIVE_MEM_BP);
+ break;
default:
break;
}
@@ -2273,6 +2326,22 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
quota->min_score = score;
}
+static void damos_trace_stat(struct damon_ctx *c, struct damos *s)
+{
+ unsigned int cidx = 0, sidx = 0;
+ struct damos *siter;
+
+ if (!trace_damos_stat_after_apply_interval_enabled())
+ return;
+
+ damon_for_each_scheme(siter, c) {
+ if (siter == s)
+ break;
+ sidx++;
+ }
+ trace_damos_stat_after_apply_interval(cidx, sidx, &s->stat);
+}
+
static void kdamond_apply_schemes(struct damon_ctx *c)
{
struct damon_target *t;
@@ -2299,6 +2368,9 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
mutex_lock(&c->walk_control_lock);
damon_for_each_target(t, c) {
+ if (c->ops.target_valid && c->ops.target_valid(t) == false)
+ continue;
+
damon_for_each_region_safe(r, next_r, t)
damon_do_apply_schemes(c, t, r);
}
@@ -2311,6 +2383,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
(s->apply_interval_us ? s->apply_interval_us :
c->attrs.aggr_interval) / sample_interval;
s->last_applied = NULL;
+ damos_trace_stat(c, s);
}
mutex_unlock(&c->walk_control_lock);
}
@@ -2424,7 +2497,7 @@ static void damon_split_region_at(struct damon_target *t,
/* Split every region in the given target into 'nr_subs' regions */
static void damon_split_regions_of(struct damon_target *t, int nr_subs,
- unsigned long min_sz_region)
+ unsigned long min_region_sz)
{
struct damon_region *r, *next;
unsigned long sz_region, sz_sub = 0;
@@ -2434,13 +2507,13 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs,
sz_region = damon_sz_region(r);
for (i = 0; i < nr_subs - 1 &&
- sz_region > 2 * min_sz_region; i++) {
+ sz_region > 2 * min_region_sz; i++) {
/*
* Randomly select size of left sub-region to be at
* least 10 percent and at most 90% of original region
*/
sz_sub = ALIGN_DOWN(damon_rand(1, 10) *
- sz_region / 10, min_sz_region);
+ sz_region / 10, min_region_sz);
/* Do not allow blank region */
if (sz_sub == 0 || sz_sub >= sz_region)
continue;
@@ -2480,7 +2553,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
nr_subregions = 3;
damon_for_each_target(t, ctx)
- damon_split_regions_of(t, nr_subregions, ctx->min_sz_region);
+ damon_split_regions_of(t, nr_subregions, ctx->min_region_sz);
last_nr_regions = nr_regions;
}
@@ -2577,41 +2650,30 @@ static void kdamond_usleep(unsigned long usecs)
*/
static void kdamond_call(struct damon_ctx *ctx, bool cancel)
{
- struct damon_call_control *control;
- LIST_HEAD(repeat_controls);
- int ret = 0;
+ struct damon_call_control *control, *next;
+ LIST_HEAD(controls);
- while (true) {
- mutex_lock(&ctx->call_controls_lock);
- control = list_first_entry_or_null(&ctx->call_controls,
- struct damon_call_control, list);
- mutex_unlock(&ctx->call_controls_lock);
- if (!control)
- break;
- if (cancel) {
+ mutex_lock(&ctx->call_controls_lock);
+ list_splice_tail_init(&ctx->call_controls, &controls);
+ mutex_unlock(&ctx->call_controls_lock);
+
+ list_for_each_entry_safe(control, next, &controls, list) {
+ if (!control->repeat || cancel)
+ list_del(&control->list);
+
+ if (cancel)
control->canceled = true;
- } else {
- ret = control->fn(control->data);
- control->return_code = ret;
- }
- mutex_lock(&ctx->call_controls_lock);
- list_del(&control->list);
- mutex_unlock(&ctx->call_controls_lock);
- if (!control->repeat) {
+ else
+ control->return_code = control->fn(control->data);
+
+ if (!control->repeat)
complete(&control->completion);
- } else if (control->canceled && control->dealloc_on_cancel) {
+ else if (control->canceled && control->dealloc_on_cancel)
kfree(control);
- continue;
- } else {
- list_add(&control->list, &repeat_controls);
- }
}
- control = list_first_entry_or_null(&repeat_controls,
- struct damon_call_control, list);
- if (!control || cancel)
- return;
+
mutex_lock(&ctx->call_controls_lock);
- list_add_tail(&control->list, &ctx->call_controls);
+ list_splice_tail(&controls, &ctx->call_controls);
mutex_unlock(&ctx->call_controls_lock);
}
@@ -2670,8 +2732,6 @@ static void kdamond_init_ctx(struct damon_ctx *ctx)
static int kdamond_fn(void *data)
{
struct damon_ctx *ctx = data;
- struct damon_target *t;
- struct damon_region *r, *next;
unsigned int max_nr_accesses = 0;
unsigned long sz_limit = 0;
@@ -2747,7 +2807,7 @@ static int kdamond_fn(void *data)
*
* Reset ->next_aggregation_sis to avoid that.
* It will anyway correctly updated after this
- * if caluse.
+ * if clause.
*/
ctx->next_aggregation_sis =
next_aggregation_sis;
@@ -2776,47 +2836,29 @@ static int kdamond_fn(void *data)
}
}
done:
- damon_for_each_target(t, ctx) {
- damon_for_each_region_safe(r, next, t)
- damon_destroy_region(r, t);
- }
+ damon_destroy_targets(ctx);
- if (ctx->ops.cleanup)
- ctx->ops.cleanup(ctx);
kfree(ctx->regions_score_histogram);
kdamond_call(ctx, true);
+ damos_walk_cancel(ctx);
pr_debug("kdamond (%d) finishes\n", current->pid);
mutex_lock(&ctx->kdamond_lock);
ctx->kdamond = NULL;
mutex_unlock(&ctx->kdamond_lock);
- damos_walk_cancel(ctx);
-
mutex_lock(&damon_lock);
nr_running_ctxs--;
if (!nr_running_ctxs && running_exclusive_ctxs)
running_exclusive_ctxs = false;
mutex_unlock(&damon_lock);
- damon_destroy_targets(ctx);
return 0;
}
-/*
- * struct damon_system_ram_region - System RAM resource address region of
- * [@start, @end).
- * @start: Start address of the region (inclusive).
- * @end: End address of the region (exclusive).
- */
-struct damon_system_ram_region {
- unsigned long start;
- unsigned long end;
-};
-
static int walk_system_ram(struct resource *res, void *arg)
{
- struct damon_system_ram_region *a = arg;
+ struct damon_addr_range *a = arg;
if (a->end - a->start < resource_size(res)) {
a->start = res->start;
@@ -2833,7 +2875,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start,
unsigned long *end)
{
- struct damon_system_ram_region arg = {};
+ struct damon_addr_range arg = {};
walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
if (arg.end <= arg.start)
@@ -2850,7 +2892,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start,
* @t: The monitoring target to set the region.
* @start: The pointer to the start address of the region.
* @end: The pointer to the end address of the region.
- * @min_sz_region: Minimum region size.
+ * @min_region_sz: Minimum region size.
*
* This function sets the region of @t as requested by @start and @end. If the
* values of @start and @end are zero, however, this function finds the biggest
@@ -2862,7 +2904,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start,
*/
int damon_set_region_biggest_system_ram_default(struct damon_target *t,
unsigned long *start, unsigned long *end,
- unsigned long min_sz_region)
+ unsigned long min_region_sz)
{
struct damon_addr_range addr_range;
@@ -2875,7 +2917,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
addr_range.start = *start;
addr_range.end = *end;
- return damon_set_regions(t, &addr_range, 1, min_sz_region);
+ return damon_set_regions(t, &addr_range, 1, min_region_sz);
}
/*
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 49b4bc294f4e..7bc5c0b2aea3 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -34,7 +34,7 @@ static bool enabled __read_mostly;
*
* Input parameters that updated while DAMON_LRU_SORT is running are not
* applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT
- * reads values of parametrs except ``enabled`` again. Once the re-reading is
+ * reads values of parameters except ``enabled`` again. Once the re-reading is
* done, this parameter is set as ``N``. If invalid parameters are found while
* the re-reading, DAMON_LRU_SORT will be disabled.
*/
@@ -42,6 +42,49 @@ static bool commit_inputs __read_mostly;
module_param(commit_inputs, bool, 0600);
/*
+ * Desired active to [in]active memory ratio in bp (1/10,000).
+ *
+ * While keeping the caps that set by other quotas, DAMON_LRU_SORT
+ * automatically increases and decreases the effective level of the quota
+ * aiming the LRU [de]prioritizations of the hot and cold memory resulting in
+ * this active to [in]active memory ratio. Value zero means disabling this
+ * auto-tuning feature.
+ *
+ * Disabled by default.
+ */
+static unsigned long active_mem_bp __read_mostly;
+module_param(active_mem_bp, ulong, 0600);
+
+/*
+ * Auto-tune monitoring intervals.
+ *
+ * If this parameter is set as ``Y``, DAMON_LRU_SORT automatically tunes
+ * DAMON's sampling and aggregation intervals. The auto-tuning aims to capture
+ * meaningful amount of access events in each DAMON-snapshot, while keeping the
+ * sampling interval 5 milliseconds in minimum, and 10 seconds in maximum.
+ * Setting this as ``N`` disables the auto-tuning.
+ *
+ * Disabled by default.
+ */
+static bool autotune_monitoring_intervals __read_mostly;
+module_param(autotune_monitoring_intervals, bool, 0600);
+
+/*
+ * Filter [non-]young pages accordingly for LRU [de]prioritizations.
+ *
+ * If this is set, check page level access (youngness) once again before each
+ * LRU [de]prioritization operation. LRU prioritization operation is skipped
+ * if the page has not accessed since the last check (not young). LRU
+ * deprioritization operation is skipped if the page has accessed since the
+ * last check (young). The feature is enabled or disabled if this parameter is
+ * set as ``Y`` or ``N``, respectively.
+ *
+ * Disabled by default.
+ */
+static bool filter_young_pages __read_mostly;
+module_param(filter_young_pages, bool, 0600);
+
+/*
* Access frequency threshold for hot memory regions identification in permil.
*
* If a memory region is accessed in frequency of this or higher,
@@ -71,7 +114,7 @@ static struct damos_quota damon_lru_sort_quota = {
/* Within the quota, mark hotter regions accessed first. */
.weight_sz = 0,
.weight_nr_accesses = 1,
- .weight_age = 0,
+ .weight_age = 1,
};
DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota);
@@ -193,10 +236,53 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres)
return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO);
}
+static int damon_lru_sort_add_quota_goals(struct damos *hot_scheme,
+ struct damos *cold_scheme)
+{
+ struct damos_quota_goal *goal;
+
+ if (!active_mem_bp)
+ return 0;
+ goal = damos_new_quota_goal(DAMOS_QUOTA_ACTIVE_MEM_BP, active_mem_bp);
+ if (!goal)
+ return -ENOMEM;
+ damos_add_quota_goal(&hot_scheme->quota, goal);
+ /* aim 0.2 % goal conflict, to keep little ping pong */
+ goal = damos_new_quota_goal(DAMOS_QUOTA_INACTIVE_MEM_BP,
+ 10000 - active_mem_bp + 2);
+ if (!goal)
+ return -ENOMEM;
+ damos_add_quota_goal(&cold_scheme->quota, goal);
+ return 0;
+}
+
+static int damon_lru_sort_add_filters(struct damos *hot_scheme,
+ struct damos *cold_scheme)
+{
+ struct damos_filter *filter;
+
+ if (!filter_young_pages)
+ return 0;
+
+ /* disallow prioritizing not-young pages */
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, false, false);
+ if (!filter)
+ return -ENOMEM;
+ damos_add_filter(hot_scheme, filter);
+
+ /* disabllow de-prioritizing young pages */
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true, false);
+ if (!filter)
+ return -ENOMEM;
+ damos_add_filter(cold_scheme, filter);
+ return 0;
+}
+
static int damon_lru_sort_apply_parameters(void)
{
struct damon_ctx *param_ctx;
struct damon_target *param_target;
+ struct damon_attrs attrs;
struct damos *hot_scheme, *cold_scheme;
unsigned int hot_thres, cold_thres;
int err;
@@ -212,25 +298,34 @@ static int damon_lru_sort_apply_parameters(void)
if (!monitor_region_start && !monitor_region_end)
addr_unit = 1;
param_ctx->addr_unit = addr_unit;
- param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1);
+ param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1);
if (!damon_lru_sort_mon_attrs.sample_interval) {
err = -EINVAL;
goto out;
}
- err = damon_set_attrs(param_ctx, &damon_lru_sort_mon_attrs);
+ attrs = damon_lru_sort_mon_attrs;
+ if (autotune_monitoring_intervals) {
+ attrs.sample_interval = 5000;
+ attrs.aggr_interval = 100000;
+ attrs.intervals_goal.access_bp = 40;
+ attrs.intervals_goal.aggrs = 3;
+ attrs.intervals_goal.min_sample_us = 5000;
+ attrs.intervals_goal.max_sample_us = 10 * 1000 * 1000;
+ }
+ err = damon_set_attrs(param_ctx, &attrs);
if (err)
goto out;
err = -ENOMEM;
- hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) *
+ hot_thres = damon_max_nr_accesses(&attrs) *
hot_thres_access_freq / 1000;
hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres);
if (!hot_scheme)
goto out;
- cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval;
+ cold_thres = cold_min_age / attrs.aggr_interval;
cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres);
if (!cold_scheme) {
damon_destroy_scheme(hot_scheme);
@@ -240,10 +335,17 @@ static int damon_lru_sort_apply_parameters(void)
damon_set_schemes(param_ctx, &hot_scheme, 1);
damon_add_scheme(param_ctx, cold_scheme);
+ err = damon_lru_sort_add_quota_goals(hot_scheme, cold_scheme);
+ if (err)
+ goto out;
+ err = damon_lru_sort_add_filters(hot_scheme, cold_scheme);
+ if (err)
+ goto out;
+
err = damon_set_region_biggest_system_ram_default(param_target,
&monitor_region_start,
&monitor_region_end,
- param_ctx->min_sz_region);
+ param_ctx->min_region_sz);
if (err)
goto out;
err = damon_commit_ctx(ctx, param_ctx);
@@ -303,7 +405,9 @@ static int damon_lru_sort_turn(bool on)
err = damon_start(&ctx, 1, true);
if (err)
return err;
- kdamond_pid = ctx->kdamond->pid;
+ kdamond_pid = damon_kdamond_pid(ctx);
+ if (kdamond_pid < 0)
+ return kdamond_pid;
return damon_call(ctx, &call_control);
}
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 07a8aead439e..9bfe48826840 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -156,7 +156,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r,
LIST_HEAD(folio_list);
bool install_young_filter = true;
struct damos_filter *filter;
- struct folio *folio;
+ struct folio *folio = NULL;
/* check access in page level again by default */
damos_for_each_ops_filter(filter, s) {
@@ -206,13 +206,13 @@ put_folio:
return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
-static inline unsigned long damon_pa_mark_accessed_or_deactivate(
+static inline unsigned long damon_pa_de_activate(
struct damon_region *r, unsigned long addr_unit,
- struct damos *s, bool mark_accessed,
+ struct damos *s, bool activate,
unsigned long *sz_filter_passed)
{
phys_addr_t addr, applied = 0;
- struct folio *folio;
+ struct folio *folio = NULL;
addr = damon_pa_phys_addr(r->ar.start, addr_unit);
while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
@@ -227,8 +227,8 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
else
*sz_filter_passed += folio_size(folio) / addr_unit;
- if (mark_accessed)
- folio_mark_accessed(folio);
+ if (activate)
+ folio_activate(folio);
else
folio_deactivate(folio);
applied += folio_nr_pages(folio);
@@ -240,20 +240,18 @@ put_folio:
return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit);
}
-static unsigned long damon_pa_mark_accessed(struct damon_region *r,
+static unsigned long damon_pa_activate_pages(struct damon_region *r,
unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, true,
- sz_filter_passed);
+ return damon_pa_de_activate(r, addr_unit, s, true, sz_filter_passed);
}
static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
unsigned long addr_unit, struct damos *s,
unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, false,
- sz_filter_passed);
+ return damon_pa_de_activate(r, addr_unit, s, false, sz_filter_passed);
}
static unsigned long damon_pa_migrate(struct damon_region *r,
@@ -262,7 +260,7 @@ static unsigned long damon_pa_migrate(struct damon_region *r,
{
phys_addr_t addr, applied;
LIST_HEAD(folio_list);
- struct folio *folio;
+ struct folio *folio = NULL;
addr = damon_pa_phys_addr(r->ar.start, addr_unit);
while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) {
@@ -295,7 +293,7 @@ static unsigned long damon_pa_stat(struct damon_region *r,
unsigned long *sz_filter_passed)
{
phys_addr_t addr;
- struct folio *folio;
+ struct folio *folio = NULL;
if (!damos_ops_has_filter(s))
return 0;
@@ -327,7 +325,7 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
case DAMOS_PAGEOUT:
return damon_pa_pageout(r, aunit, scheme, sz_filter_passed);
case DAMOS_LRU_PRIO:
- return damon_pa_mark_accessed(r, aunit, scheme,
+ return damon_pa_activate_pages(r, aunit, scheme,
sz_filter_passed);
case DAMOS_LRU_DEPRIO:
return damon_pa_deactivate_pages(r, aunit, scheme,
@@ -375,7 +373,6 @@ static int __init damon_pa_initcall(void)
.prepare_access_checks = damon_pa_prepare_access_checks,
.check_accesses = damon_pa_check_accesses,
.target_valid = NULL,
- .cleanup = NULL,
.apply_scheme = damon_pa_apply_scheme,
.get_scheme_score = damon_pa_scheme_score,
};
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 36a582e09eae..43d76f5bed44 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -34,7 +34,7 @@ static bool enabled __read_mostly;
*
* Input parameters that updated while DAMON_RECLAIM is running are not applied
* by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values
- * of parametrs except ``enabled`` again. Once the re-reading is done, this
+ * of parameters except ``enabled`` again. Once the re-reading is done, this
* parameter is set as ``N``. If invalid parameters are found while the
* re-reading, DAMON_RECLAIM will be disabled.
*/
@@ -208,7 +208,7 @@ static int damon_reclaim_apply_parameters(void)
if (!monitor_region_start && !monitor_region_end)
addr_unit = 1;
param_ctx->addr_unit = addr_unit;
- param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1);
+ param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1);
if (!damon_reclaim_mon_attrs.aggr_interval) {
err = -EINVAL;
@@ -251,7 +251,7 @@ static int damon_reclaim_apply_parameters(void)
err = damon_set_region_biggest_system_ram_default(param_target,
&monitor_region_start,
&monitor_region_end,
- param_ctx->min_sz_region);
+ param_ctx->min_region_sz);
if (err)
goto out;
err = damon_commit_ctx(ctx, param_ctx);
@@ -307,7 +307,9 @@ static int damon_reclaim_turn(bool on)
err = damon_start(&ctx, 1, true);
if (err)
return err;
- kdamond_pid = ctx->kdamond->pid;
+ kdamond_pid = damon_kdamond_pid(ctx);
+ if (kdamond_pid < 0)
+ return kdamond_pid;
return damon_call(ctx, &call_control);
}
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
index ed8e3629d31a..bcf6c8ae9b90 100644
--- a/mm/damon/stat.c
+++ b/mm/damon/stat.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Shows data access monitoring resutls in simple metrics.
+ * Shows data access monitoring results in simple metrics.
*/
#define pr_fmt(fmt) "damon-stat: " fmt
@@ -34,7 +34,7 @@ module_param(estimated_memory_bandwidth, ulong, 0400);
MODULE_PARM_DESC(estimated_memory_bandwidth,
"Estimated memory bandwidth usage in bytes per second");
-static long memory_idle_ms_percentiles[101] __read_mostly = {0,};
+static long memory_idle_ms_percentiles[101] = {0,};
module_param_array(memory_idle_ms_percentiles, long, NULL, 0400);
MODULE_PARM_DESC(memory_idle_ms_percentiles,
"Memory idle time percentiles in milliseconds");
@@ -173,14 +173,6 @@ static struct damon_ctx *damon_stat_build_ctx(void)
if (damon_set_attrs(ctx, &attrs))
goto free_out;
- /*
- * auto-tune sampling and aggregation interval aiming 4% DAMON-observed
- * accesses ratio, keeping sampling interval in [5ms, 10s] range.
- */
- ctx->attrs.intervals_goal = (struct damon_intervals_goal) {
- .access_bp = 400, .aggrs = 3,
- .min_sample_us = 5000, .max_sample_us = 10000000,
- };
if (damon_select_ops(ctx, DAMON_OPS_PADDR))
goto free_out;
@@ -189,7 +181,7 @@ static struct damon_ctx *damon_stat_build_ctx(void)
goto free_out;
damon_add_target(ctx, target);
if (damon_set_region_biggest_system_ram_default(target, &start, &end,
- ctx->min_sz_region))
+ ctx->min_region_sz))
goto free_out;
return ctx;
free_out:
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 3a699dcd5a7f..2b05a6477188 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -204,6 +204,8 @@ struct damon_sysfs_stats {
unsigned long sz_applied;
unsigned long sz_ops_filter_passed;
unsigned long qt_exceeds;
+ unsigned long nr_snapshots;
+ unsigned long max_nr_snapshots;
};
static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void)
@@ -265,6 +267,37 @@ static ssize_t qt_exceeds_show(struct kobject *kobj,
return sysfs_emit(buf, "%lu\n", stats->qt_exceeds);
}
+static ssize_t nr_snapshots_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->nr_snapshots);
+}
+
+static ssize_t max_nr_snapshots_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->max_nr_snapshots);
+}
+
+static ssize_t max_nr_snapshots_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+ unsigned long max_nr_snapshots, err = kstrtoul(buf, 0, &max_nr_snapshots);
+
+ if (err)
+ return err;
+ stats->max_nr_snapshots = max_nr_snapshots;
+ return count;
+}
+
static void damon_sysfs_stats_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_stats, kobj));
@@ -288,6 +321,12 @@ static struct kobj_attribute damon_sysfs_stats_sz_ops_filter_passed_attr =
static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr =
__ATTR_RO_MODE(qt_exceeds, 0400);
+static struct kobj_attribute damon_sysfs_stats_nr_snapshots_attr =
+ __ATTR_RO_MODE(nr_snapshots, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_max_nr_snapshots_attr =
+ __ATTR_RW_MODE(max_nr_snapshots, 0600);
+
static struct attribute *damon_sysfs_stats_attrs[] = {
&damon_sysfs_stats_nr_tried_attr.attr,
&damon_sysfs_stats_sz_tried_attr.attr,
@@ -295,6 +334,8 @@ static struct attribute *damon_sysfs_stats_attrs[] = {
&damon_sysfs_stats_sz_applied_attr.attr,
&damon_sysfs_stats_sz_ops_filter_passed_attr.attr,
&damon_sysfs_stats_qt_exceeds_attr.attr,
+ &damon_sysfs_stats_nr_snapshots_attr.attr,
+ &damon_sysfs_stats_max_nr_snapshots_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_stats);
@@ -1038,6 +1079,14 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = {
.metric = DAMOS_QUOTA_NODE_MEMCG_FREE_BP,
.name = "node_memcg_free_bp",
},
+ {
+ .metric = DAMOS_QUOTA_ACTIVE_MEM_BP,
+ .name = "active_mem_bp",
+ },
+ {
+ .metric = DAMOS_QUOTA_INACTIVE_MEM_BP,
+ .name = "inactive_mem_bp",
+ },
};
static ssize_t target_metric_show(struct kobject *kobj,
@@ -2288,7 +2337,6 @@ static ssize_t target_nid_store(struct kobject *kobj,
struct damon_sysfs_scheme, kobj);
int err = 0;
- /* TODO: error handling for target_nid range. */
err = kstrtoint(buf, 0, &scheme->target_nid);
return err ? err : count;
@@ -2454,7 +2502,7 @@ static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg,
return false;
}
-static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id)
+static int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id)
{
struct mem_cgroup *memcg;
char *path;
@@ -2469,8 +2517,8 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id)
for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg;
memcg = mem_cgroup_iter(NULL, memcg, NULL)) {
- /* skip removed memcg */
- if (!mem_cgroup_id(memcg))
+ /* skip offlined memcg */
+ if (!mem_cgroup_online(memcg))
continue;
if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) {
*id = mem_cgroup_id(memcg);
@@ -2719,6 +2767,7 @@ static struct damos *damon_sysfs_mk_scheme(
damon_destroy_scheme(scheme);
return NULL;
}
+ scheme->max_nr_snapshots = sysfs_scheme->stats->max_nr_snapshots;
return scheme;
}
@@ -2763,6 +2812,7 @@ void damon_sysfs_schemes_update_stats(
sysfs_stats->sz_ops_filter_passed =
scheme->stat.sz_ops_filter_passed;
sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
+ sysfs_stats->nr_snapshots = scheme->stat.nr_snapshots;
}
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 95fd9375a7d8..b7f66196bec4 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1365,7 +1365,7 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
static int damon_sysfs_set_regions(struct damon_target *t,
struct damon_sysfs_regions *sysfs_regions,
- unsigned long min_sz_region)
+ unsigned long min_region_sz)
{
struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr,
sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN);
@@ -1387,7 +1387,7 @@ static int damon_sysfs_set_regions(struct damon_target *t,
if (ranges[i - 1].end > ranges[i].start)
goto out;
}
- err = damon_set_regions(t, ranges, sysfs_regions->nr, min_sz_region);
+ err = damon_set_regions(t, ranges, sysfs_regions->nr, min_region_sz);
out:
kfree(ranges);
return err;
@@ -1409,7 +1409,8 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
return -EINVAL;
}
t->obsolete = sys_target->obsolete;
- return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region);
+ return damon_sysfs_set_regions(t, sys_target->regions,
+ ctx->min_region_sz);
}
static int damon_sysfs_add_targets(struct damon_ctx *ctx,
@@ -1469,8 +1470,8 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
ctx->addr_unit = sys_ctx->addr_unit;
/* addr_unit is respected by only DAMON_OPS_PADDR */
if (sys_ctx->ops_id == DAMON_OPS_PADDR)
- ctx->min_sz_region = max(
- DAMON_MIN_REGION / sys_ctx->addr_unit, 1);
+ ctx->min_region_sz = max(
+ DAMON_MIN_REGION_SZ / sys_ctx->addr_unit, 1);
err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
if (err)
return err;
@@ -1819,10 +1820,9 @@ static ssize_t pid_show(struct kobject *kobj,
if (!ctx)
goto out;
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond)
- pid = ctx->kdamond->pid;
- mutex_unlock(&ctx->kdamond_lock);
+ pid = damon_kdamond_pid(ctx);
+ if (pid < 0)
+ pid = -1;
out:
mutex_unlock(&damon_sysfs_lock);
return sysfs_emit(buf, "%d\n", pid);
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index 8cb369b63e08..92ea25e2dc9e 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -158,6 +158,7 @@ static void damon_test_split_at(struct kunit *test)
r->nr_accesses_bp = 420000;
r->nr_accesses = 42;
r->last_nr_accesses = 15;
+ r->age = 10;
damon_add_region(r, t);
damon_split_region_at(t, r, 25);
KUNIT_EXPECT_EQ(test, r->ar.start, 0ul);
@@ -170,6 +171,7 @@ static void damon_test_split_at(struct kunit *test)
KUNIT_EXPECT_EQ(test, r->nr_accesses_bp, r_new->nr_accesses_bp);
KUNIT_EXPECT_EQ(test, r->nr_accesses, r_new->nr_accesses);
KUNIT_EXPECT_EQ(test, r->last_nr_accesses, r_new->last_nr_accesses);
+ KUNIT_EXPECT_EQ(test, r->age, r_new->age);
damon_free_target(t);
}
@@ -190,6 +192,7 @@ static void damon_test_merge_two(struct kunit *test)
}
r->nr_accesses = 10;
r->nr_accesses_bp = 100000;
+ r->age = 9;
damon_add_region(r, t);
r2 = damon_new_region(100, 300);
if (!r2) {
@@ -198,12 +201,15 @@ static void damon_test_merge_two(struct kunit *test)
}
r2->nr_accesses = 20;
r2->nr_accesses_bp = 200000;
+ r2->age = 21;
damon_add_region(r2, t);
damon_merge_two_regions(t, r, r2);
KUNIT_EXPECT_EQ(test, r->ar.start, 0ul);
KUNIT_EXPECT_EQ(test, r->ar.end, 300ul);
KUNIT_EXPECT_EQ(test, r->nr_accesses, 16u);
+ KUNIT_EXPECT_EQ(test, r->nr_accesses_bp, 160000u);
+ KUNIT_EXPECT_EQ(test, r->age, 17u);
i = 0;
damon_for_each_region(r3, t) {
@@ -232,12 +238,12 @@ static void damon_test_merge_regions_of(struct kunit *test)
{
struct damon_target *t;
struct damon_region *r;
- unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184};
- unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230};
- unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2};
+ unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184, 230};
+ unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230, 10170};
+ unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2, 5};
- unsigned long saddrs[] = {0, 114, 130, 156, 170};
- unsigned long eaddrs[] = {112, 130, 156, 170, 230};
+ unsigned long saddrs[] = {0, 114, 130, 156, 170, 230};
+ unsigned long eaddrs[] = {112, 130, 156, 170, 230, 10170};
int i;
t = damon_new_target();
@@ -255,9 +261,9 @@ static void damon_test_merge_regions_of(struct kunit *test)
}
damon_merge_regions_of(t, 9, 9999);
- /* 0-112, 114-130, 130-156, 156-170 */
- KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u);
- for (i = 0; i < 5; i++) {
+ /* 0-112, 114-130, 130-156, 156-170, 170-230, 230-10170 */
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 6u);
+ for (i = 0; i < 6; i++) {
r = __nth_region_of(t, i);
KUNIT_EXPECT_EQ(test, r->ar.start, saddrs[i]);
KUNIT_EXPECT_EQ(test, r->ar.end, eaddrs[i]);
@@ -269,6 +275,9 @@ static void damon_test_split_regions_of(struct kunit *test)
{
struct damon_target *t;
struct damon_region *r;
+ unsigned long sa[] = {0, 300, 500};
+ unsigned long ea[] = {220, 400, 700};
+ int i;
t = damon_new_target();
if (!t)
@@ -295,6 +304,23 @@ static void damon_test_split_regions_of(struct kunit *test)
damon_split_regions_of(t, 4, 1);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
damon_free_target(t);
+
+ t = damon_new_target();
+ if (!t)
+ kunit_skip(test, "third target alloc fail");
+ for (i = 0; i < ARRAY_SIZE(sa); i++) {
+ r = damon_new_region(sa[i], ea[i]);
+ if (!r) {
+ damon_free_target(t);
+ kunit_skip(test, "region alloc fail");
+ }
+ damon_add_region(r, t);
+ }
+ damon_split_regions_of(t, 4, 5);
+ KUNIT_EXPECT_LE(test, damon_nr_regions(t), 12u);
+ damon_for_each_region(r, t)
+ KUNIT_EXPECT_GE(test, damon_sz_region(r) % 5ul, 0ul);
+ damon_free_target(t);
}
static void damon_test_ops_registration(struct kunit *test)
@@ -574,9 +600,10 @@ static void damos_test_commit_quota_goal(struct kunit *test)
});
damos_test_commit_quota_goal_for(test, &dst,
&(struct damos_quota_goal) {
- .metric = DAMOS_QUOTA_USER_INPUT,
- .target_value = 789,
- .current_value = 12,
+ .metric = DAMOS_QUOTA_SOME_MEM_PSI_US,
+ .target_value = 234,
+ .current_value = 345,
+ .last_psi_total = 567,
});
}
@@ -1159,7 +1186,7 @@ static void damon_test_set_filters_default_reject(struct kunit *test)
damos_set_filters_default_reject(&scheme);
/*
* A core-handled allow-filter is installed.
- * Rejct by default on core layer filtering stage due to the last
+ * Reject by default on core layer filtering stage due to the last
* core-layer-filter's behavior.
* Allow by default on ops layer filtering stage due to the absence of
* ops layer filters.
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index 30dc5459f1d2..cfae870178bf 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -147,7 +147,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
damon_add_region(r, t);
}
- damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION);
+ damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION_SZ);
for (i = 0; i < nr_expected / 2; i++) {
r = __nth_region_of(t, i);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 23ed738a0bd6..83ab3d8c3792 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -19,8 +19,8 @@
#include "ops-common.h"
#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
-#undef DAMON_MIN_REGION
-#define DAMON_MIN_REGION 1
+#undef DAMON_MIN_REGION_SZ
+#define DAMON_MIN_REGION_SZ 1
#endif
/*
@@ -78,7 +78,7 @@ static int damon_va_evenly_split_region(struct damon_target *t,
orig_end = r->ar.end;
sz_orig = damon_sz_region(r);
- sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);
+ sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION_SZ);
if (!sz_piece)
return -EINVAL;
@@ -161,12 +161,12 @@ next:
swap(first_gap, second_gap);
/* Store the result */
- regions[0].start = ALIGN(start, DAMON_MIN_REGION);
- regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION);
- regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION);
- regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION);
- regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION);
- regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION);
+ regions[0].start = ALIGN(start, DAMON_MIN_REGION_SZ);
+ regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION_SZ);
+ regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION_SZ);
+ regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION_SZ);
+ regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION_SZ);
+ regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION_SZ);
return 0;
}
@@ -259,8 +259,8 @@ static void __damon_va_init_regions(struct damon_ctx *ctx,
sz += regions[i].end - regions[i].start;
if (ctx->attrs.min_nr_regions)
sz /= ctx->attrs.min_nr_regions;
- if (sz < DAMON_MIN_REGION)
- sz = DAMON_MIN_REGION;
+ if (sz < DAMON_MIN_REGION_SZ)
+ sz = DAMON_MIN_REGION_SZ;
/* Set the initial three regions of the target */
for (i = 0; i < 3; i++) {
@@ -299,7 +299,7 @@ static void damon_va_update(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
if (damon_va_three_regions(t, three_regions))
continue;
- damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION);
+ damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION_SZ);
}
}
@@ -1014,7 +1014,6 @@ static int __init damon_va_initcall(void)
.check_accesses = damon_va_check_accesses,
.target_valid = damon_va_target_valid,
.cleanup_target = damon_va_cleanup_target,
- .cleanup = NULL,
.apply_scheme = damon_va_apply_scheme,
.get_scheme_score = damon_va_scheme_score,
};
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index ae9b9310d96f..83cf07269f13 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -971,22 +971,26 @@ static unsigned long __init get_random_vaddr(void)
return random_vaddr;
}
-static void __init destroy_args(struct pgtable_debug_args *args)
+static void __init
+debug_vm_pgtable_free_huge_page(struct pgtable_debug_args *args,
+ unsigned long pfn, int order)
{
- struct page *page = NULL;
+#ifdef CONFIG_CONTIG_ALLOC
+ if (args->is_contiguous_page) {
+ free_contig_range(pfn, 1 << order);
+ return;
+ }
+#endif
+ __free_pages(pfn_to_page(pfn), order);
+}
+static void __init destroy_args(struct pgtable_debug_args *args)
+{
/* Free (huge) page */
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
has_transparent_pud_hugepage() &&
args->pud_pfn != ULONG_MAX) {
- if (args->is_contiguous_page) {
- free_contig_range(args->pud_pfn,
- (1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT)));
- } else {
- page = pfn_to_page(args->pud_pfn);
- __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT);
- }
-
+ debug_vm_pgtable_free_huge_page(args, args->pud_pfn, HPAGE_PUD_ORDER);
args->pud_pfn = ULONG_MAX;
args->pmd_pfn = ULONG_MAX;
args->pte_pfn = ULONG_MAX;
@@ -995,20 +999,13 @@ static void __init destroy_args(struct pgtable_debug_args *args)
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
has_transparent_hugepage() &&
args->pmd_pfn != ULONG_MAX) {
- if (args->is_contiguous_page) {
- free_contig_range(args->pmd_pfn, (1 << HPAGE_PMD_ORDER));
- } else {
- page = pfn_to_page(args->pmd_pfn);
- __free_pages(page, HPAGE_PMD_ORDER);
- }
-
+ debug_vm_pgtable_free_huge_page(args, args->pmd_pfn, HPAGE_PMD_ORDER);
args->pmd_pfn = ULONG_MAX;
args->pte_pfn = ULONG_MAX;
}
if (args->pte_pfn != ULONG_MAX) {
- page = pfn_to_page(args->pte_pfn);
- __free_page(page);
+ __free_page(pfn_to_page(args->pte_pfn));
args->pte_pfn = ULONG_MAX;
}
@@ -1242,8 +1239,7 @@ static int __init init_args(struct pgtable_debug_args *args)
*/
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
has_transparent_pud_hugepage()) {
- page = debug_vm_pgtable_alloc_huge_page(args,
- HPAGE_PUD_SHIFT - PAGE_SHIFT);
+ page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PUD_ORDER);
if (page) {
args->pud_pfn = page_to_pfn(page);
args->pmd_pfn = args->pud_pfn;
diff --git a/mm/dmapool_test.c b/mm/dmapool_test.c
index 54b1fd1ccfbb..e8172d708308 100644
--- a/mm/dmapool_test.c
+++ b/mm/dmapool_test.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/device.h>
#include <linux/dma-map-ops.h>
#include <linux/dma-mapping.h>
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index ff35b84a7b50..96c29b9dc85d 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -30,6 +30,14 @@ static int __init early_ioremap_debug_setup(char *str)
}
early_param("early_ioremap_debug", early_ioremap_debug_setup);
+#define early_ioremap_dbg(fmt, args...) \
+ do { \
+ if (unlikely(early_ioremap_debug)) { \
+ pr_warn(fmt, ##args); \
+ dump_stack(); \
+ } \
+ } while (0)
+
static int after_paging_init __initdata;
pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr,
@@ -139,6 +147,9 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
if (WARN_ON(nrpages > NR_FIX_BTMAPS))
return NULL;
+ early_ioremap_dbg("%s(%pa, %08lx) [%d] => %08lx + %08lx\n",
+ __func__, &phys_addr, size, slot, slot_virt[slot], offset);
+
/*
* Ok, go for it..
*/
@@ -152,8 +163,6 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
--idx;
--nrpages;
}
- WARN(early_ioremap_debug, "%s(%pa, %08lx) [%d] => %08lx + %08lx\n",
- __func__, &phys_addr, size, slot, offset, slot_virt[slot]);
prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
return prev_map[slot];
@@ -184,8 +193,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
__func__, addr, size, slot, prev_size[slot]))
return;
- WARN(early_ioremap_debug, "%s(%p, %08lx) [%d]\n",
- __func__, addr, size, slot);
+ early_ioremap_dbg("%s(%p, %08lx) [%d]\n", __func__, addr, size, slot);
virt_addr = (unsigned long)addr;
if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 45540942d148..a02179a0bded 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Compatibility functions which bloat the callers too much to make inline.
* All of the callers of these functions should be converted to use folios
diff --git a/mm/gup.c b/mm/gup.c
index 95d948c8e86c..8e7dc2c6ee73 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2806,17 +2806,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
return !reject_file_backed || shmem_mapping(mapping);
}
-static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start,
- unsigned int flags, struct page **pages)
-{
- while ((*nr) - nr_start) {
- struct folio *folio = page_folio(pages[--(*nr)]);
-
- folio_clear_referenced(folio);
- gup_put_folio(folio, 1, flags);
- }
-}
-
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
/*
* GUP-fast relies on pte change detection to avoid concurrent pgtable
diff --git a/mm/gup_test.c b/mm/gup_test.c
index eeb3f4d87c51..9dd48db897b9 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
diff --git a/mm/highmem.c b/mm/highmem.c
index b5c8e4c2d5d4..a33e41183951 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -180,12 +180,13 @@ struct page *__kmap_to_page(void *vaddr)
for (i = 0; i < kctrl->idx; i++) {
unsigned long base_addr;
int idx;
+ pte_t pteval = kctrl->pteval[i];
idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
if (base_addr == base)
- return pte_page(kctrl->pteval[i]);
+ return pte_page(pteval);
}
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a6d37902b73d..0d487649e4de 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3431,7 +3431,7 @@ static void remap_page(struct folio *folio, unsigned long nr, int flags)
if (!folio_test_anon(folio))
return;
for (;;) {
- remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
+ remove_migration_ptes(folio, folio, TTU_RMAP_LOCKED | flags);
i += folio_nr_pages(folio);
if (i >= nr)
break;
@@ -3944,7 +3944,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
int old_order = folio_order(folio);
struct folio *new_folio, *next;
int nr_shmem_dropped = 0;
- int remap_flags = 0;
+ enum ttu_flags ttu_flags = 0;
int ret;
pgoff_t end = 0;
@@ -4064,9 +4064,9 @@ fail:
shmem_uncharge(mapping->host, nr_shmem_dropped);
if (!ret && is_anon && !folio_is_device_private(folio))
- remap_flags = RMP_USE_SHARED_ZEROPAGE;
+ ttu_flags = TTU_USE_SHARED_ZEROPAGE;
- remap_page(folio, 1 << old_order, remap_flags);
+ remap_page(folio, 1 << old_order, ttu_flags);
/*
* Unlock all after-split folios except the one containing
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a1832da0f623..0b005e944ee3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -121,16 +121,6 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
-static void hugetlb_free_folio(struct folio *folio)
-{
- if (folio_test_hugetlb_cma(folio)) {
- hugetlb_cma_free_folio(folio);
- return;
- }
-
- folio_put(folio);
-}
-
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
if (spool->count)
@@ -588,8 +578,9 @@ hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
list_add(&nrg->link, rg);
coalesce_file_region(map, nrg);
- } else
+ } else {
*regions_needed += 1;
+ }
return to - from;
}
@@ -1257,8 +1248,9 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
if (vma_lock && vma_lock->vma != vma)
vma->vm_private_data = NULL;
- } else
+ } else {
vma->vm_private_data = NULL;
+ }
}
/*
@@ -1417,47 +1409,25 @@ err:
return NULL;
}
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-#ifdef CONFIG_CONTIG_ALLOC
-static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask,
+#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && defined(CONFIG_CONTIG_ALLOC)
+static struct folio *alloc_gigantic_frozen_folio(int order, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
struct folio *folio;
- bool retried = false;
-
-retry:
- folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask);
- if (!folio) {
- if (hugetlb_cma_exclusive_alloc())
- return NULL;
-
- folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask);
- if (!folio)
- return NULL;
- }
- if (folio_ref_freeze(folio, 1))
+ folio = hugetlb_cma_alloc_frozen_folio(order, gfp_mask, nid, nodemask);
+ if (folio)
return folio;
- pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio));
- hugetlb_free_folio(folio);
- if (!retried) {
- retried = true;
- goto retry;
- }
- return NULL;
-}
+ if (hugetlb_cma_exclusive_alloc())
+ return NULL;
-#else /* !CONFIG_CONTIG_ALLOC */
-static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid,
- nodemask_t *nodemask)
-{
- return NULL;
+ folio = (struct folio *)alloc_contig_frozen_pages(1 << order, gfp_mask,
+ nid, nodemask);
+ return folio;
}
-#endif /* CONFIG_CONTIG_ALLOC */
-
-#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid,
+#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE || !CONFIG_CONTIG_ALLOC */
+static struct folio *alloc_gigantic_frozen_folio(int order, gfp_t gfp_mask, int nid,
nodemask_t *nodemask)
{
return NULL;
@@ -1587,9 +1557,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
if (unlikely(folio_test_hwpoison(folio)))
folio_clear_hugetlb_hwpoison(folio);
- folio_ref_unfreeze(folio, 1);
-
- hugetlb_free_folio(folio);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+ if (folio_test_hugetlb_cma(folio))
+ hugetlb_cma_free_frozen_folio(folio);
+ else
+ free_frozen_pages(&folio->page, folio_order(folio));
}
/*
@@ -1869,7 +1841,7 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
return NULL;
}
-static struct folio *alloc_buddy_hugetlb_folio(int order, gfp_t gfp_mask,
+static struct folio *alloc_buddy_frozen_folio(int order, gfp_t gfp_mask,
int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry)
{
struct folio *folio;
@@ -1925,10 +1897,10 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h,
nid = numa_mem_id();
if (order_is_gigantic(order))
- folio = alloc_gigantic_folio(order, gfp_mask, nid, nmask);
+ folio = alloc_gigantic_frozen_folio(order, gfp_mask, nid, nmask);
else
- folio = alloc_buddy_hugetlb_folio(order, gfp_mask, nid, nmask,
- node_alloc_noretry);
+ folio = alloc_buddy_frozen_folio(order, gfp_mask, nid, nmask,
+ node_alloc_noretry);
if (folio)
init_new_hugetlb_folio(folio);
return folio;
@@ -2106,8 +2078,9 @@ retry:
h->max_huge_pages++;
goto out;
}
- } else
+ } else {
rc = 0;
+ }
update_and_free_hugetlb_folio(h, folio, false);
return rc;
@@ -2702,11 +2675,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
* be consumed on a subsequent allocation.
*/
folio_set_hugetlb_restore_reserve(folio);
- } else
+ } else {
/*
* No reservation present, do nothing
*/
- vma_end_reservation(h, vma, address);
+ vma_end_reservation(h, vma, address);
+ }
}
}
@@ -2836,23 +2810,62 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
*/
int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
{
- struct folio *folio;
+ unsigned long nr = 0;
+ struct page *page;
+ struct hstate *h;
+ LIST_HEAD(list);
int ret = 0;
- LIST_HEAD(isolate_list);
+ /* Avoid pfn iterations if no free non-gigantic huge pages */
+ for_each_hstate(h) {
+ if (hstate_is_gigantic(h))
+ continue;
+
+ nr += h->free_huge_pages;
+ if (nr)
+ break;
+ }
+
+ if (!nr)
+ return 0;
while (start_pfn < end_pfn) {
- folio = pfn_folio(start_pfn);
+ page = pfn_to_page(start_pfn);
+ nr = 1;
- /* Not to disrupt normal path by vainly holding hugetlb_lock */
- if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) {
- ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list);
- if (ret)
- break;
+ if (PageHuge(page) || PageCompound(page)) {
+ struct folio *folio = page_folio(page);
+
+ nr = folio_nr_pages(folio) - folio_page_idx(folio, page);
+
+ /*
+ * Don't disrupt normal path by vainly holding
+ * hugetlb_lock
+ */
+ if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) {
+ if (order_is_gigantic(folio_order(folio))) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ret = alloc_and_dissolve_hugetlb_folio(folio, &list);
+ if (ret)
+ break;
+
+ putback_movable_pages(&list);
+ }
+ } else if (PageBuddy(page)) {
+ /*
+ * Buddy order check without zone lock is unsafe and
+ * the order is maybe invalid, but race should be
+ * small, and the worst thing is skipping free hugetlb.
+ */
+ const unsigned int order = buddy_order_unsafe(page);
- putback_movable_pages(&isolate_list);
+ if (order <= MAX_PAGE_ORDER)
+ nr = 1UL << order;
}
- start_pfn++;
+ start_pfn += nr;
}
return ret;
@@ -3019,13 +3032,10 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
rsv_adjust = hugepage_subpool_put_pages(spool, 1);
hugetlb_acct_memory(h, -rsv_adjust);
- if (map_chg) {
- spin_lock_irq(&hugetlb_lock);
- hugetlb_cgroup_uncharge_folio_rsvd(
- hstate_index(h), pages_per_huge_page(h),
- folio);
- spin_unlock_irq(&hugetlb_lock);
- }
+ spin_lock_irq(&hugetlb_lock);
+ hugetlb_cgroup_uncharge_folio_rsvd(
+ hstate_index(h), pages_per_huge_page(h), folio);
+ spin_unlock_irq(&hugetlb_lock);
}
}
@@ -3425,6 +3435,13 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
&node_states[N_MEMORY], NULL);
+ if (!folio && !list_empty(&folio_list) &&
+ hugetlb_vmemmap_optimizable_size(h)) {
+ prep_and_add_allocated_folios(h, &folio_list);
+ INIT_LIST_HEAD(&folio_list);
+ folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
+ &node_states[N_MEMORY], NULL);
+ }
if (!folio)
break;
list_add(&folio->lru, &folio_list);
@@ -4159,7 +4176,6 @@ static int __init hugetlb_init(void)
}
}
- hugetlb_cma_check();
hugetlb_init_hstates();
gather_bootmem_prealloc();
report_hugepages();
@@ -4487,21 +4503,11 @@ void __init hugetlb_bootmem_set_nodes(void)
}
}
-static bool __hugetlb_bootmem_allocated __initdata;
-
-bool __init hugetlb_bootmem_allocated(void)
-{
- return __hugetlb_bootmem_allocated;
-}
-
void __init hugetlb_bootmem_alloc(void)
{
struct hstate *h;
int i;
- if (__hugetlb_bootmem_allocated)
- return;
-
hugetlb_bootmem_set_nodes();
for (i = 0; i < MAX_NUMNODES; i++)
@@ -4515,8 +4521,6 @@ void __init hugetlb_bootmem_alloc(void)
if (hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
}
-
- __hugetlb_bootmem_allocated = true;
}
/*
@@ -4718,10 +4722,12 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
if (vma_lock->vma != vma) {
vma->vm_private_data = NULL;
hugetlb_vma_lock_alloc(vma);
- } else
+ } else {
pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
- } else
+ }
+ } else {
hugetlb_vma_lock_alloc(vma);
+ }
}
}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 58e895f3899a..792d06538fa9 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
*
* Copyright IBM Corporation, 2012
@@ -7,14 +8,6 @@
* Copyright (C) 2019 Red Hat, Inc.
* Author: Giuseppe Scrivano <gscrivan@redhat.com>
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#include <linux/cgroup.h>
@@ -822,7 +815,7 @@ hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft,
for (i = 0; i < tmpl_size; cft++, tmpl++, i++) {
*cft = *tmpl;
/* rebuild the name */
- snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name);
+ scnprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name);
/* rebuild the private */
cft->private = MEMFILE_PRIVATE(idx, tmpl->private);
/* rebuild the file_offset */
diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c
index e8e4dc7182d5..f83ae4998990 100644
--- a/mm/hugetlb_cma.c
+++ b/mm/hugetlb_cma.c
@@ -13,42 +13,46 @@
#include "hugetlb_cma.h"
-static struct cma *hugetlb_cma[MAX_NUMNODES];
+static struct cma *hugetlb_cma[MAX_NUMNODES] __ro_after_init;
static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
-static bool hugetlb_cma_only;
-static unsigned long hugetlb_cma_size __initdata;
+static bool hugetlb_cma_only __ro_after_init;
+static unsigned long hugetlb_cma_size __ro_after_init;
-void hugetlb_cma_free_folio(struct folio *folio)
+void hugetlb_cma_free_frozen_folio(struct folio *folio)
{
- int nid = folio_nid(folio);
-
- WARN_ON_ONCE(!cma_free_folio(hugetlb_cma[nid], folio));
+ WARN_ON_ONCE(!cma_release_frozen(hugetlb_cma[folio_nid(folio)],
+ &folio->page, folio_nr_pages(folio)));
}
-
-struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask)
+struct folio *hugetlb_cma_alloc_frozen_folio(int order, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
{
int node;
- struct folio *folio = NULL;
+ struct folio *folio;
+ struct page *page = NULL;
+
+ if (!hugetlb_cma_size)
+ return NULL;
if (hugetlb_cma[nid])
- folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask);
+ page = cma_alloc_frozen_compound(hugetlb_cma[nid], order);
- if (!folio && !(gfp_mask & __GFP_THISNODE)) {
+ if (!page && !(gfp_mask & __GFP_THISNODE)) {
for_each_node_mask(node, *nodemask) {
if (node == nid || !hugetlb_cma[node])
continue;
- folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask);
- if (folio)
+ page = cma_alloc_frozen_compound(hugetlb_cma[node], order);
+ if (page)
break;
}
}
- if (folio)
- folio_set_hugetlb_cma(folio);
+ if (!page)
+ return NULL;
+ folio = page_folio(page);
+ folio_set_hugetlb_cma(folio);
return folio;
}
@@ -85,9 +89,6 @@ hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact)
return m;
}
-
-static bool cma_reserve_called __initdata;
-
static int __init cmdline_parse_hugetlb_cma(char *p)
{
int nid, count = 0;
@@ -134,12 +135,26 @@ static int __init cmdline_parse_hugetlb_cma_only(char *p)
early_param("hugetlb_cma_only", cmdline_parse_hugetlb_cma_only);
-void __init hugetlb_cma_reserve(int order)
+unsigned int __weak arch_hugetlb_cma_order(void)
{
- unsigned long size, reserved, per_node;
+ return 0;
+}
+
+void __init hugetlb_cma_reserve(void)
+{
+ unsigned long size, reserved, per_node, order;
bool node_specific_cma_alloc = false;
int nid;
+ if (!hugetlb_cma_size)
+ return;
+
+ order = arch_hugetlb_cma_order();
+ if (!order) {
+ pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
+ return;
+ }
+
/*
* HugeTLB CMA reservation is required for gigantic
* huge pages which could not be allocated via the
@@ -147,10 +162,6 @@ void __init hugetlb_cma_reserve(int order)
* breaking this assumption.
*/
VM_WARN_ON(order <= MAX_PAGE_ORDER);
- cma_reserve_called = true;
-
- if (!hugetlb_cma_size)
- return;
hugetlb_bootmem_set_nodes();
@@ -244,14 +255,6 @@ void __init hugetlb_cma_reserve(int order)
hugetlb_cma_size = 0;
}
-void __init hugetlb_cma_check(void)
-{
- if (!hugetlb_cma_size || cma_reserve_called)
- return;
-
- pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
-}
-
bool hugetlb_cma_exclusive_alloc(void)
{
return hugetlb_cma_only;
diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h
index 2c2ec8a7e134..c619c394b1ae 100644
--- a/mm/hugetlb_cma.h
+++ b/mm/hugetlb_cma.h
@@ -3,23 +3,22 @@
#define _LINUX_HUGETLB_CMA_H
#ifdef CONFIG_CMA
-void hugetlb_cma_free_folio(struct folio *folio);
-struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask,
+void hugetlb_cma_free_frozen_folio(struct folio *folio);
+struct folio *hugetlb_cma_alloc_frozen_folio(int order, gfp_t gfp_mask,
int nid, nodemask_t *nodemask);
struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid,
bool node_exact);
-void hugetlb_cma_check(void);
bool hugetlb_cma_exclusive_alloc(void);
unsigned long hugetlb_cma_total_size(void);
void hugetlb_cma_validate_params(void);
bool hugetlb_early_cma(struct hstate *h);
#else
-static inline void hugetlb_cma_free_folio(struct folio *folio)
+static inline void hugetlb_cma_free_frozen_folio(struct folio *folio)
{
}
-static inline struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask)
+static inline struct folio *hugetlb_cma_alloc_frozen_folio(int order,
+ gfp_t gfp_mask, int nid, nodemask_t *nodemask)
{
return NULL;
}
@@ -31,10 +30,6 @@ struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid,
return NULL;
}
-static inline void hugetlb_cma_check(void)
-{
-}
-
static inline bool hugetlb_cma_exclusive_alloc(void)
{
return false;
diff --git a/mm/hugetlb_sysctl.c b/mm/hugetlb_sysctl.c
index bd3077150542..e74cf18ad431 100644
--- a/mm/hugetlb_sysctl.c
+++ b/mm/hugetlb_sysctl.c
@@ -8,6 +8,8 @@
#include "hugetlb_internal.h"
+int movable_gigantic_pages;
+
#ifdef CONFIG_SYSCTL
static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write,
void *buffer, size_t *length,
@@ -125,6 +127,15 @@ static const struct ctl_table hugetlb_table[] = {
.mode = 0644,
.proc_handler = hugetlb_overcommit_handler,
},
+#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+ {
+ .procname = "movable_gigantic_pages",
+ .data = &movable_gigantic_pages,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
};
void __init hugetlb_sysctl_init(void)
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 9d01f883fd71..a9280259e12a 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -794,14 +794,6 @@ void __init hugetlb_vmemmap_init_early(int nid)
struct huge_bootmem_page *m = NULL;
void *map;
- /*
- * Noting to do if bootmem pages were not allocated
- * early in boot, or if HVO wasn't enabled in the
- * first place.
- */
- if (!hugetlb_bootmem_allocated())
- return;
-
if (!READ_ONCE(vmemmap_optimize_enabled))
return;
@@ -847,9 +839,6 @@ void __init hugetlb_vmemmap_init_late(int nid)
struct hstate *h;
void *map;
- if (!hugetlb_bootmem_allocated())
- return;
-
if (!READ_ONCE(vmemmap_optimize_enabled))
return;
diff --git a/mm/internal.h b/mm/internal.h
index aacda4f79534..aee1f72ef6ed 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -171,7 +171,7 @@ static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
/*
* OK, we tried to call the file hook for mmap(), but an error
- * arose. The mapping is in an inconsistent state and we most not invoke
+ * arose. The mapping is in an inconsistent state and we must not invoke
* any further hooks on it.
*/
vma->vm_ops = &vma_dummy_vm_ops;
@@ -199,6 +199,73 @@ static inline void vma_close(struct vm_area_struct *vma)
#ifdef CONFIG_MMU
+static inline void get_anon_vma(struct anon_vma *anon_vma)
+{
+ atomic_inc(&anon_vma->refcount);
+}
+
+void __put_anon_vma(struct anon_vma *anon_vma);
+
+static inline void put_anon_vma(struct anon_vma *anon_vma)
+{
+ if (atomic_dec_and_test(&anon_vma->refcount))
+ __put_anon_vma(anon_vma);
+}
+
+static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
+{
+ down_write(&anon_vma->root->rwsem);
+}
+
+static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
+{
+ return down_write_trylock(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
+{
+ up_write(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
+{
+ down_read(&anon_vma->root->rwsem);
+}
+
+static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
+{
+ return down_read_trylock(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
+{
+ up_read(&anon_vma->root->rwsem);
+}
+
+struct anon_vma *folio_get_anon_vma(const struct folio *folio);
+
+/* Operations which modify VMAs. */
+enum vma_operation {
+ VMA_OP_SPLIT,
+ VMA_OP_MERGE_UNFAULTED,
+ VMA_OP_REMAP,
+ VMA_OP_FORK,
+};
+
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+ enum vma_operation operation);
+int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma);
+int __anon_vma_prepare(struct vm_area_struct *vma);
+void unlink_anon_vmas(struct vm_area_struct *vma);
+
+static inline int anon_vma_prepare(struct vm_area_struct *vma)
+{
+ if (likely(vma->anon_vma))
+ return 0;
+
+ return __anon_vma_prepare(vma);
+}
+
/* Flags for folio_pte_batch(). */
typedef int __bitwise fpb_t;
@@ -513,6 +580,14 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
}
+static inline void set_pages_refcounted(struct page *page, unsigned long nr_pages)
+{
+ unsigned long pfn = page_to_pfn(page);
+
+ for (; nr_pages--; pfn++)
+ set_page_refcounted(pfn_to_page(pfn));
+}
+
/*
* Return true if a folio needs ->release_folio() calling upon it.
*/
@@ -853,6 +928,12 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
unsigned long, enum meminit_context, struct vmem_altmap *, int,
bool);
+#ifdef CONFIG_SPARSEMEM
+void sparse_init(void);
+#else
+static inline void sparse_init(void) {}
+#endif /* CONFIG_SPARSEMEM */
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -929,9 +1010,14 @@ void init_cma_reserved_pageblock(struct page *page);
struct cma;
#ifdef CONFIG_CMA
+bool cma_validate_zones(struct cma *cma);
void *cma_reserve_early(struct cma *cma, unsigned long size);
void init_cma_pageblock(struct page *page);
#else
+static inline bool cma_validate_zones(struct cma *cma)
+{
+ return false;
+}
static inline void *cma_reserve_early(struct cma *cma, unsigned long size)
{
return NULL;
@@ -1658,24 +1744,6 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
pgd_t *pgd, void *private);
-/* pt_reclaim.c */
-bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
-void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
- pmd_t pmdval);
-void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
- struct mmu_gather *tlb);
-
-#ifdef CONFIG_PT_RECLAIM
-bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
- struct zap_details *details);
-#else
-static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
- struct zap_details *details)
-{
- return false;
-}
-#endif /* CONFIG_PT_RECLAIM */
-
void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index 2cafca31b092..b4d157962121 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -1840,6 +1840,29 @@ static void vmalloc_helpers_tags(struct kunit *test)
vfree(ptr);
}
+static void vmalloc_oob_helper(struct kunit *test, char *v_ptr, size_t size)
+{
+ /*
+ * We have to be careful not to hit the guard page in vmalloc tests.
+ * The MMU will catch that and crash us.
+ */
+
+ /* Make sure in-bounds accesses are valid. */
+ v_ptr[0] = 0;
+ v_ptr[size - 1] = 0;
+
+ /*
+ * An unaligned access past the requested vmalloc size.
+ * Only generic KASAN can precisely detect these.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]);
+
+ /* An aligned access into the first out-of-bounds granule. */
+ size = round_up(size, KASAN_GRANULE_SIZE);
+ KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)v_ptr)[size]);
+}
+
static void vmalloc_oob(struct kunit *test)
{
char *v_ptr, *p_ptr;
@@ -1856,24 +1879,21 @@ static void vmalloc_oob(struct kunit *test)
OPTIMIZER_HIDE_VAR(v_ptr);
- /*
- * We have to be careful not to hit the guard page in vmalloc tests.
- * The MMU will catch that and crash us.
- */
+ vmalloc_oob_helper(test, v_ptr, size);
- /* Make sure in-bounds accesses are valid. */
- v_ptr[0] = 0;
- v_ptr[size - 1] = 0;
+ size -= KASAN_GRANULE_SIZE + 1;
+ v_ptr = vrealloc(v_ptr, size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
- /*
- * An unaligned access past the requested vmalloc size.
- * Only generic KASAN can precisely detect these.
- */
- if (IS_ENABLED(CONFIG_KASAN_GENERIC))
- KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]);
+ OPTIMIZER_HIDE_VAR(v_ptr);
- /* An aligned access into the first out-of-bounds granule. */
- KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)v_ptr)[size + 5]);
+ vmalloc_oob_helper(test, v_ptr, size);
+
+ size += 2 * KASAN_GRANULE_SIZE + 2;
+ v_ptr = vrealloc(v_ptr, size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ vmalloc_oob_helper(test, v_ptr, size);
/* Check that in-bounds accesses to the physical page are valid. */
page = vmalloc_to_page(v_ptr);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 62c01b4527eb..27efb78eb32d 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -203,7 +203,7 @@ static inline void fail_non_kasan_kunit_test(void) { }
static DEFINE_RAW_SPINLOCK(report_lock);
-static void start_report(unsigned long *flags, bool sync)
+static void start_report(unsigned long *flags)
{
fail_non_kasan_kunit_test();
/* Respect the /proc/sys/kernel/traceoff_on_warning interface. */
@@ -543,7 +543,7 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty
if (unlikely(!report_enabled()))
return;
- start_report(&flags, true);
+ start_report(&flags);
__memset(&info, 0, sizeof(info));
info.type = type;
@@ -581,7 +581,7 @@ bool kasan_report(const void *addr, size_t size, bool is_write,
goto out;
}
- start_report(&irq_flags, true);
+ start_report(&irq_flags);
__memset(&info, 0, sizeof(info));
info.type = KASAN_REPORT_ACCESS;
@@ -615,7 +615,7 @@ void kasan_report_async(void)
if (unlikely(!report_enabled()))
return;
- start_report(&flags, false);
+ start_report(&flags);
pr_err("BUG: KASAN: invalid-access\n");
pr_err("Asynchronous fault: no details available\n");
pr_err("\n");
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 32fbdf759ea2..d286e0a04543 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -305,7 +305,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
pte_t pte;
int index;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_pause();
index = PFN_DOWN(addr - data->start);
page = data->pages[index];
@@ -319,7 +319,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
}
spin_unlock(&init_mm.page_table_lock);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_resume();
return 0;
}
@@ -471,7 +471,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
pte_t pte;
int none;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_pause();
spin_lock(&init_mm.page_table_lock);
pte = ptep_get(ptep);
@@ -483,7 +483,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
if (likely(!none))
__free_page(pfn_to_page(pte_pfn(pte)));
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_resume();
return 0;
}
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 653e162fa494..b4ea3262c925 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -905,7 +905,7 @@ static void toggle_allocation_gate(struct work_struct *work)
/* Disable static key and reset timer. */
static_branch_disable(&kfence_allocation_key);
#endif
- queue_delayed_work(system_unbound_wq, &kfence_timer,
+ queue_delayed_work(system_dfl_wq, &kfence_timer,
msecs_to_jiffies(kfence_sample_interval));
}
@@ -955,7 +955,7 @@ static void kfence_init_enable(void)
#endif
WRITE_ONCE(kfence_enabled, true);
- queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
+ queue_delayed_work(system_dfl_wq, &kfence_timer, 0);
pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
@@ -1051,7 +1051,7 @@ static int kfence_enable_late(void)
return kfence_init_late();
WRITE_ONCE(kfence_enabled, true);
- queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
+ queue_delayed_work(system_dfl_wq, &kfence_timer, 0);
pr_info("re-enabled\n");
return 0;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 97d1b2824386..1b8faae5b448 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -22,6 +22,7 @@
#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/pgalloc.h>
+#include <linux/backing-dev.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -58,6 +59,7 @@ enum scan_result {
SCAN_STORE_FAILED,
SCAN_COPY_MC,
SCAN_PAGE_FILLED,
+ SCAN_PAGE_DIRTY_OR_WRITEBACK,
};
#define CREATE_TRACE_POINTS
@@ -535,17 +537,16 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
}
-static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
- unsigned long start_addr,
- pte_t *pte,
- struct collapse_control *cc,
- struct list_head *compound_pagelist)
+static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
+ struct list_head *compound_pagelist)
{
struct page *page = NULL;
struct folio *folio = NULL;
unsigned long addr = start_addr;
pte_t *_pte;
- int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
+ int none_or_zero = 0, shared = 0, referenced = 0;
+ enum scan_result result = SCAN_FAIL;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, addr += PAGE_SIZE) {
@@ -778,13 +779,13 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
* @ptl: lock on raw pages' PTEs
* @compound_pagelist: list that stores compound pages
*/
-static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
+static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
unsigned long address, spinlock_t *ptl,
struct list_head *compound_pagelist)
{
unsigned int i;
- int result = SCAN_SUCCEED;
+ enum scan_result result = SCAN_SUCCEED;
/*
* Copying pages' contents is subject to memory poison at any iteration.
@@ -826,7 +827,7 @@ static void khugepaged_alloc_sleep(void)
remove_wait_queue(&khugepaged_wait, &wait);
}
-struct collapse_control khugepaged_collapse_control = {
+static struct collapse_control khugepaged_collapse_control = {
.is_khugepaged = true,
};
@@ -896,10 +897,8 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
* Returns enum scan_result value.
*/
-static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
- bool expect_anon,
- struct vm_area_struct **vmap,
- struct collapse_control *cc)
+static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
+ bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc)
{
struct vm_area_struct *vma;
enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
@@ -928,7 +927,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED;
}
-static inline int check_pmd_state(pmd_t *pmd)
+static inline enum scan_result check_pmd_state(pmd_t *pmd)
{
pmd_t pmde = pmdp_get_lockless(pmd);
@@ -951,9 +950,8 @@ static inline int check_pmd_state(pmd_t *pmd)
return SCAN_SUCCEED;
}
-static int find_pmd_or_thp_or_none(struct mm_struct *mm,
- unsigned long address,
- pmd_t **pmd)
+static enum scan_result find_pmd_or_thp_or_none(struct mm_struct *mm,
+ unsigned long address, pmd_t **pmd)
{
*pmd = mm_find_pmd(mm, address);
if (!*pmd)
@@ -962,12 +960,11 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
return check_pmd_state(*pmd);
}
-static int check_pmd_still_valid(struct mm_struct *mm,
- unsigned long address,
- pmd_t *pmd)
+static enum scan_result check_pmd_still_valid(struct mm_struct *mm,
+ unsigned long address, pmd_t *pmd)
{
pmd_t *new_pmd;
- int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
+ enum scan_result result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
if (result != SCAN_SUCCEED)
return result;
@@ -983,15 +980,14 @@ static int check_pmd_still_valid(struct mm_struct *mm,
* Called and returns without pte mapped or spinlocks held.
* Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
*/
-static int __collapse_huge_page_swapin(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long start_addr, pmd_t *pmd,
- int referenced)
+static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd,
+ int referenced)
{
int swapped_in = 0;
vm_fault_t ret = 0;
unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
- int result;
+ enum scan_result result;
pte_t *pte = NULL;
spinlock_t *ptl;
@@ -1060,8 +1056,8 @@ out:
return result;
}
-static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
- struct collapse_control *cc)
+static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
+ struct collapse_control *cc)
{
gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
GFP_TRANSHUGE);
@@ -1088,9 +1084,8 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
return SCAN_SUCCEED;
}
-static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
- int referenced, int unmapped,
- struct collapse_control *cc)
+static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
+ int referenced, int unmapped, struct collapse_control *cc)
{
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
@@ -1098,7 +1093,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
pgtable_t pgtable;
struct folio *folio;
spinlock_t *pmd_ptl, *pte_ptl;
- int result = SCAN_FAIL;
+ enum scan_result result = SCAN_FAIL;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
@@ -1244,15 +1239,14 @@ out_nolock:
return result;
}
-static int hpage_collapse_scan_pmd(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long start_addr, bool *mmap_locked,
- struct collapse_control *cc)
+static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked,
+ struct collapse_control *cc)
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int result = SCAN_FAIL, referenced = 0;
- int none_or_zero = 0, shared = 0;
+ int none_or_zero = 0, shared = 0, referenced = 0;
+ enum scan_result result = SCAN_FAIL;
struct page *page = NULL;
struct folio *folio = NULL;
unsigned long addr;
@@ -1439,8 +1433,8 @@ static void collect_mm_slot(struct mm_slot *slot)
}
/* folio must be locked, and mmap_lock must be held */
-static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmdp, struct folio *folio, struct page *page)
+static enum scan_result set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmdp, struct folio *folio, struct page *page)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_fault vmf = {
@@ -1475,22 +1469,11 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
return SCAN_SUCCEED;
}
-/**
- * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
- * address haddr.
- *
- * @mm: process address space where collapse happens
- * @addr: THP collapse address
- * @install_pmd: If a huge PMD should be installed
- *
- * This function checks whether all the PTEs in the PMD are pointing to the
- * right THP. If so, retract the page table so the THP can refault in with
- * as pmd-mapped. Possibly install a huge PMD mapping the THP.
- */
-int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
- bool install_pmd)
+static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
+ bool install_pmd)
{
- int nr_mapped_ptes = 0, result = SCAN_FAIL;
+ enum scan_result result = SCAN_FAIL;
+ int nr_mapped_ptes = 0;
unsigned int nr_batch_ptes;
struct mmu_notifier_range range;
bool notified = false;
@@ -1709,6 +1692,24 @@ drop_folio:
return result;
}
+/**
+ * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
+ * address haddr.
+ *
+ * @mm: process address space where collapse happens
+ * @addr: THP collapse address
+ * @install_pmd: If a huge PMD should be installed
+ *
+ * This function checks whether all the PTEs in the PMD are pointing to the
+ * right THP. If so, retract the page table so the THP can refault in with
+ * as pmd-mapped. Possibly install a huge PMD mapping the THP.
+ */
+void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
+ bool install_pmd)
+{
+ try_collapse_pte_mapped_thp(mm, addr, install_pmd);
+}
+
/* Can we retract page tables for this file-backed VMA? */
static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
{
@@ -1854,9 +1855,8 @@ drop_pml:
* + unlock old pages
* + unlock and free huge page;
*/
-static int collapse_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start,
- struct collapse_control *cc)
+static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start, struct collapse_control *cc)
{
struct address_space *mapping = file->f_mapping;
struct page *dst;
@@ -1864,7 +1864,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
pgoff_t index = 0, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
- int nr_none = 0, result = SCAN_SUCCEED;
+ enum scan_result result = SCAN_SUCCEED;
+ int nr_none = 0;
bool is_shmem = shmem_file(file);
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
@@ -1967,11 +1968,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
*/
xas_unlock_irq(&xas);
filemap_flush(mapping);
- result = SCAN_FAIL;
+ result = SCAN_PAGE_DIRTY_OR_WRITEBACK;
goto xa_unlocked;
} else if (folio_test_writeback(folio)) {
xas_unlock_irq(&xas);
- result = SCAN_FAIL;
+ result = SCAN_PAGE_DIRTY_OR_WRITEBACK;
goto xa_unlocked;
} else if (folio_trylock(folio)) {
folio_get(folio);
@@ -2018,7 +2019,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
* folio is dirty because it hasn't been flushed
* since first write.
*/
- result = SCAN_FAIL;
+ result = SCAN_PAGE_DIRTY_OR_WRITEBACK;
goto out_unlock;
}
@@ -2194,16 +2195,13 @@ immap_locked:
xas_lock_irq(&xas);
}
- if (is_shmem)
+ if (is_shmem) {
+ lruvec_stat_mod_folio(new_folio, NR_SHMEM, HPAGE_PMD_NR);
lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
- else
+ } else {
lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
-
- if (nr_none) {
- lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
- /* nr_none is always 0 for non-shmem. */
- lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
}
+ lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, HPAGE_PMD_NR);
/*
* Mark new_folio as uptodate before inserting it into the
@@ -2225,7 +2223,7 @@ immap_locked:
/*
* Remove pte page tables, so we can re-fault the page as huge.
- * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
+ * If MADV_COLLAPSE, adjust result to call try_collapse_pte_mapped_thp().
*/
retract_page_tables(mapping, start);
if (cc && !cc->is_khugepaged)
@@ -2237,6 +2235,11 @@ immap_locked:
*/
list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
list_del(&folio->lru);
+ lruvec_stat_mod_folio(folio, NR_FILE_PAGES,
+ -folio_nr_pages(folio));
+ if (is_shmem)
+ lruvec_stat_mod_folio(folio, NR_SHMEM,
+ -folio_nr_pages(folio));
folio->mapping = NULL;
folio_clear_active(folio);
folio_clear_unevictable(folio);
@@ -2285,16 +2288,15 @@ out:
return result;
}
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start,
- struct collapse_control *cc)
+static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start, struct collapse_control *cc)
{
struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
int present, swap;
int node = NUMA_NO_NODE;
- int result = SCAN_SUCCEED;
+ enum scan_result result = SCAN_SUCCEED;
present = 0;
swap = 0;
@@ -2392,7 +2394,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
return result;
}
-static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result *result,
struct collapse_control *cc)
__releases(&khugepaged_mm_lock)
__acquires(&khugepaged_mm_lock)
@@ -2440,14 +2442,15 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
break;
}
if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
-skip:
progress++;
continue;
}
hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
- if (khugepaged_scan.address > hend)
- goto skip;
+ if (khugepaged_scan.address > hend) {
+ progress++;
+ continue;
+ }
if (khugepaged_scan.address < hstart)
khugepaged_scan.address = hstart;
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
@@ -2476,7 +2479,7 @@ skip:
mmap_read_lock(mm);
if (hpage_collapse_test_exit_or_disable(mm))
goto breakouterloop;
- *result = collapse_pte_mapped_thp(mm,
+ *result = try_collapse_pte_mapped_thp(mm,
khugepaged_scan.address, false);
if (*result == SCAN_PMD_MAPPED)
*result = SCAN_SUCCEED;
@@ -2552,7 +2555,7 @@ static void khugepaged_do_scan(struct collapse_control *cc)
unsigned int progress = 0, pass_through_head = 0;
unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
bool wait = true;
- int result = SCAN_SUCCEED;
+ enum scan_result result = SCAN_SUCCEED;
lru_add_drain_all();
@@ -2747,6 +2750,7 @@ static int madvise_collapse_errno(enum scan_result r)
case SCAN_PAGE_LRU:
case SCAN_DEL_PAGE_LRU:
case SCAN_PAGE_FILLED:
+ case SCAN_PAGE_DIRTY_OR_WRITEBACK:
return -EAGAIN;
/*
* Other: Trying again likely not to succeed / error intrinsic to
@@ -2764,7 +2768,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
struct collapse_control *cc;
struct mm_struct *mm = vma->vm_mm;
unsigned long hstart, hend, addr;
- int thps = 0, last_fail = SCAN_FAIL;
+ enum scan_result last_fail = SCAN_FAIL;
+ int thps = 0;
bool mmap_locked = true;
BUG_ON(vma->vm_start > start);
@@ -2785,8 +2790,10 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
hend = end & HPAGE_PMD_MASK;
for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
- int result = SCAN_FAIL;
+ enum scan_result result = SCAN_FAIL;
+ bool triggered_wb = false;
+retry:
if (!mmap_locked) {
cond_resched();
mmap_read_lock(mm);
@@ -2807,8 +2814,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmap_read_unlock(mm);
mmap_locked = false;
+ *lock_dropped = true;
result = hpage_collapse_scan_file(mm, addr, file, pgoff,
cc);
+
+ if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb &&
+ mapping_can_writeback(file->f_mapping)) {
+ loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
+ loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
+
+ filemap_write_and_wait_range(file->f_mapping, lstart, lend);
+ triggered_wb = true;
+ fput(file);
+ goto retry;
+ }
fput(file);
} else {
result = hpage_collapse_scan_pmd(mm, vma, addr,
@@ -2826,7 +2845,7 @@ handle_result:
case SCAN_PTE_MAPPED_HUGEPAGE:
BUG_ON(mmap_locked);
mmap_read_lock(mm);
- result = collapse_pte_mapped_thp(mm, addr, true);
+ result = try_collapse_pte_mapped_thp(mm, addr, true);
mmap_read_unlock(mm);
goto handle_result;
/* Whitelisted set of results where continuing OK */
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index 902ec48b1e3e..81e642db6e23 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -361,7 +361,7 @@ static void test_init_vmalloc(struct kunit *test)
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
-/* Test case: ensure that use-after-free reporting works. */
+/* Test case: ensure that use-after-free reporting works for kmalloc. */
static void test_uaf(struct kunit *test)
{
EXPECTATION_USE_AFTER_FREE(expect);
@@ -378,6 +378,65 @@ static void test_uaf(struct kunit *test)
KUNIT_EXPECT_TRUE(test, report_matches(&expect));
}
+static void test_uninit_page(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ struct page *page;
+ int *ptr;
+
+ kunit_info(test, "uninitialized page allocation (UMR report)\n");
+ page = alloc_pages(GFP_KERNEL, 0);
+ ptr = page_address(page);
+ USE(*ptr);
+ __free_pages(page, 0);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static volatile char *test_uaf_pages_helper(int order, int offset)
+{
+ struct page *page;
+ volatile char *var;
+
+ /* Memory is initialized up until __free_pages() thanks to __GFP_ZERO. */
+ page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+ var = page_address(page) + offset;
+ __free_pages(page, order);
+
+ return var;
+}
+
+/* Test case: ensure that use-after-free reporting works for a freed page. */
+static void test_uaf_pages(struct kunit *test)
+{
+ EXPECTATION_USE_AFTER_FREE(expect);
+ volatile char value;
+
+ kunit_info(test, "use-after-free on a freed page (UMR report)\n");
+ /* Allocate a single page, free it, then try to access it. */
+ value = *test_uaf_pages_helper(0, 3);
+ USE(value);
+
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that UAF reporting works for high order pages. */
+static void test_uaf_high_order_pages(struct kunit *test)
+{
+ EXPECTATION_USE_AFTER_FREE(expect);
+ volatile char value;
+
+ kunit_info(test,
+ "use-after-free on a freed high-order page (UMR report)\n");
+ /*
+ * Create a high-order non-compound page, free it, then try to access
+ * its tail page.
+ */
+ value = *test_uaf_pages_helper(1, PAGE_SIZE + 3);
+ USE(value);
+
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
/*
* Test case: ensure that uninitialized values are propagated through per-CPU
* memory.
@@ -682,7 +741,10 @@ static struct kunit_case kmsan_test_cases[] = {
KUNIT_CASE(test_uninit_kmsan_check_memory),
KUNIT_CASE(test_init_kmsan_vmap_vunmap),
KUNIT_CASE(test_init_vmalloc),
+ KUNIT_CASE(test_uninit_page),
KUNIT_CASE(test_uaf),
+ KUNIT_CASE(test_uaf_pages),
+ KUNIT_CASE(test_uaf_high_order_pages),
KUNIT_CASE(test_percpu_propagate),
KUNIT_CASE(test_printk),
KUNIT_CASE(test_init_memcpy),
diff --git a/mm/list_lru.c b/mm/list_lru.c
index ec48b5dadf51..13b9f66d950e 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -187,7 +187,7 @@ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item)
if (list_lru_memcg_aware(lru)) {
rcu_read_lock();
- ret = list_lru_add(lru, item, nid, mem_cgroup_from_slab_obj(item));
+ ret = list_lru_add(lru, item, nid, mem_cgroup_from_virt(item));
rcu_read_unlock();
} else {
ret = list_lru_add(lru, item, nid, NULL);
@@ -224,7 +224,7 @@ bool list_lru_del_obj(struct list_lru *lru, struct list_head *item)
if (list_lru_memcg_aware(lru)) {
rcu_read_lock();
- ret = list_lru_del(lru, item, nid, mem_cgroup_from_slab_obj(item));
+ ret = list_lru_del(lru, item, nid, mem_cgroup_from_virt(item));
rcu_read_unlock();
} else {
ret = list_lru_del(lru, item, nid, NULL);
@@ -369,7 +369,7 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
xa_for_each(&lru->xa, index, mlru) {
rcu_read_lock();
- memcg = mem_cgroup_from_id(index);
+ memcg = mem_cgroup_from_private_id(index);
if (!mem_cgroup_tryget(memcg)) {
rcu_read_unlock();
continue;
diff --git a/mm/madvise.c b/mm/madvise.c
index b617b1be0f53..1f3040688f04 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -109,9 +109,7 @@ void anon_vma_name_free(struct kref *kref)
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
- if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
- vma_assert_locked(vma);
-
+ vma_assert_stabilised(vma);
return vma->anon_name;
}
@@ -453,7 +451,7 @@ restart:
if (!start_pte)
return 0;
flush_tlb_batched_pending(mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
nr = 1;
ptent = ptep_get(pte);
@@ -461,7 +459,7 @@ restart:
if (++batch_count == SWAP_CLUSTER_MAX) {
batch_count = 0;
if (need_resched()) {
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(start_pte, ptl);
cond_resched();
goto restart;
@@ -497,7 +495,7 @@ restart:
if (!folio_trylock(folio))
continue;
folio_get(folio);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(start_pte, ptl);
start_pte = NULL;
err = split_folio(folio);
@@ -508,7 +506,7 @@ restart:
if (!start_pte)
break;
flush_tlb_batched_pending(mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
if (!err)
nr = 0;
continue;
@@ -556,7 +554,7 @@ restart:
}
if (start_pte) {
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(start_pte, ptl);
}
if (pageout)
@@ -675,7 +673,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (!start_pte)
return 0;
flush_tlb_batched_pending(mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
nr = 1;
ptent = ptep_get(pte);
@@ -694,7 +692,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
max_nr = (end - addr) / PAGE_SIZE;
nr = swap_pte_batch(pte, max_nr, ptent);
nr_swap -= nr;
- free_swap_and_cache_nr(entry, nr);
+ swap_put_entries_direct(entry, nr);
clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
} else if (softleaf_is_hwpoison(entry) ||
softleaf_is_poison_marker(entry)) {
@@ -724,7 +722,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (!folio_trylock(folio))
continue;
folio_get(folio);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(start_pte, ptl);
start_pte = NULL;
err = split_folio(folio);
@@ -735,7 +733,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (!start_pte)
break;
flush_tlb_batched_pending(mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
if (!err)
nr = 0;
continue;
@@ -775,7 +773,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (nr_swap)
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
if (start_pte) {
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(start_pte, ptl);
}
cond_resched();
@@ -1867,7 +1865,7 @@ static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
* madvise_should_skip() - Return if the request is invalid or nothing.
* @start: Start address of madvise-requested address range.
* @len_in: Length of madvise-requested address range.
- * @behavior: Requested madvise behavor.
+ * @behavior: Requested madvise behavior.
* @err: Pointer to store an error code from the check.
*
* If the specified behaviour is invalid or nothing would occur, we skip the
diff --git a/mm/memblock.c b/mm/memblock.c
index 905d06b16348..e76255e4ff36 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -773,7 +773,7 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt
unsigned long start_pfn, end_pfn, mem_size_mb;
int nid, i;
- /* calculate lose page */
+ /* calculate lost page */
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
if (!numa_valid_node(nid))
nr_pages += end_pfn - start_pfn;
@@ -2414,7 +2414,7 @@ EXPORT_SYMBOL_GPL(reserve_mem_find_by_name);
/**
* reserve_mem_release_by_name - Release reserved memory region with a given name
- * @name: The name that is attatched to a reserved memory region
+ * @name: The name that is attached to a reserved memory region
*
* Forcibly release the pages in the reserved memory region so that those memory
* can be used as free memory. After released the reserved region size becomes 0.
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 6eed14bff742..0e3d972fad33 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -427,6 +427,28 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
}
#endif
+static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+{
+ unsigned long val;
+
+ if (mem_cgroup_is_root(memcg)) {
+ /*
+ * Approximate root's usage from global state. This isn't
+ * perfect, but the root usage was always an approximation.
+ */
+ val = global_node_page_state(NR_FILE_PAGES) +
+ global_node_page_state(NR_ANON_MAPPED);
+ if (swap)
+ val += total_swap_pages - get_nr_swap_pages();
+ } else {
+ if (!swap)
+ val = page_counter_read(&memcg->memory);
+ else
+ val = page_counter_read(&memcg->memsw);
+ }
+ return val;
+}
+
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
struct mem_cgroup_threshold_ary *t;
@@ -613,14 +635,14 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
* have an ID allocated to it anymore, charge the closest online
* ancestor for the swap instead and transfer the memory+swap charge.
*/
- swap_memcg = mem_cgroup_id_get_online(memcg);
+ swap_memcg = mem_cgroup_private_id_get_online(memcg);
nr_entries = folio_nr_pages(folio);
/* Get references for the tail pages, too */
if (nr_entries > 1)
- mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+ mem_cgroup_private_id_get_many(swap_memcg, nr_entries - 1);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry);
+ swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry);
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index a304ad418cdf..eb3c3c105657 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -22,15 +22,13 @@
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
-
void drain_all_stock(struct mem_cgroup *root_memcg);
unsigned long memcg_events(struct mem_cgroup *memcg, int event);
int memory_stat_show(struct seq_file *m, void *v);
-void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n);
-struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg);
+void mem_cgroup_private_id_get_many(struct mem_cgroup *memcg, unsigned int n);
+struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg);
/* Cgroup v1-specific declarations */
#ifdef CONFIG_MEMCG_V1
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 36ab9897b61b..b730233a481d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -646,7 +646,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
* in latency-sensitive paths is as cheap as possible.
*/
__mem_cgroup_flush_stats(root_mem_cgroup, true);
- queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
+ queue_delayed_work(system_dfl_wq, &stats_flush_dwork, FLUSH_TIME);
}
unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
@@ -816,7 +816,7 @@ void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
struct lruvec *lruvec;
rcu_read_lock();
- memcg = mem_cgroup_from_slab_obj(p);
+ memcg = mem_cgroup_from_virt(p);
/*
* Untracked pages have no memcg, no lruvec. Update only the
@@ -1639,11 +1639,6 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
return max;
}
-unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
-{
- return page_counter_read(&memcg->memory);
-}
-
void __memcg_memory_event(struct mem_cgroup *memcg,
enum memcg_memory_event event, bool allow_spinning)
{
@@ -2658,7 +2653,7 @@ struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p)
* The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
* cgroup_mutex, etc.
*/
-struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+struct mem_cgroup *mem_cgroup_from_virt(void *p)
{
struct slab *slab;
@@ -3320,28 +3315,6 @@ void folio_split_memcg_refs(struct folio *folio, unsigned old_order,
css_get_many(&__folio_memcg(folio)->css, new_refs);
}
-unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
-{
- unsigned long val;
-
- if (mem_cgroup_is_root(memcg)) {
- /*
- * Approximate root's usage from global state. This isn't
- * perfect, but the root usage was always an approximation.
- */
- val = global_node_page_state(NR_FILE_PAGES) +
- global_node_page_state(NR_ANON_MAPPED);
- if (swap)
- val += total_swap_pages - get_nr_swap_pages();
- } else {
- if (!swap)
- val = page_counter_read(&memcg->memory);
- else
- val = page_counter_read(&memcg->memsw);
- }
- return val;
-}
-
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg;
@@ -3629,38 +3602,38 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
*/
#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
-static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids);
+static DEFINE_XARRAY_ALLOC1(mem_cgroup_private_ids);
-static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
+static void mem_cgroup_private_id_remove(struct mem_cgroup *memcg)
{
if (memcg->id.id > 0) {
- xa_erase(&mem_cgroup_ids, memcg->id.id);
+ xa_erase(&mem_cgroup_private_ids, memcg->id.id);
memcg->id.id = 0;
}
}
-void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
+void __maybe_unused mem_cgroup_private_id_get_many(struct mem_cgroup *memcg,
unsigned int n)
{
refcount_add(n, &memcg->id.ref);
}
-static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
+static void mem_cgroup_private_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
if (refcount_sub_and_test(n, &memcg->id.ref)) {
- mem_cgroup_id_remove(memcg);
+ mem_cgroup_private_id_remove(memcg);
/* Memcg ID pins CSS */
css_put(&memcg->css);
}
}
-static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
+static inline void mem_cgroup_private_id_put(struct mem_cgroup *memcg)
{
- mem_cgroup_id_put_many(memcg, 1);
+ mem_cgroup_private_id_put_many(memcg, 1);
}
-struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
+struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg)
{
while (!refcount_inc_not_zero(&memcg->id.ref)) {
/*
@@ -3679,39 +3652,35 @@ struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
}
/**
- * mem_cgroup_from_id - look up a memcg from a memcg id
+ * mem_cgroup_from_private_id - look up a memcg from a memcg id
* @id: the memcg id to look up
*
* Caller must hold rcu_read_lock().
*/
-struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id)
{
WARN_ON_ONCE(!rcu_read_lock_held());
- return xa_load(&mem_cgroup_ids, id);
+ return xa_load(&mem_cgroup_private_ids, id);
}
-#ifdef CONFIG_SHRINKER_DEBUG
-struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+struct mem_cgroup *mem_cgroup_get_from_id(u64 id)
{
struct cgroup *cgrp;
struct cgroup_subsys_state *css;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = NULL;
- cgrp = cgroup_get_from_id(ino);
+ cgrp = cgroup_get_from_id(id);
if (IS_ERR(cgrp))
- return ERR_CAST(cgrp);
+ return NULL;
css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
if (css)
memcg = container_of(css, struct mem_cgroup, css);
- else
- memcg = ERR_PTR(-ENOENT);
cgroup_put(cgrp);
return memcg;
}
-#endif
static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn)
{
@@ -3786,7 +3755,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
if (!memcg)
return ERR_PTR(-ENOMEM);
- error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL,
+ error = xa_alloc(&mem_cgroup_private_ids, &memcg->id.id, NULL,
XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL);
if (error)
goto fail;
@@ -3846,7 +3815,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
lru_gen_init_memcg(memcg);
return memcg;
fail:
- mem_cgroup_id_remove(memcg);
+ mem_cgroup_private_id_remove(memcg);
__mem_cgroup_free(memcg);
return ERR_PTR(error);
}
@@ -3920,7 +3889,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
goto offline_kmem;
if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
- queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
+ queue_delayed_work(system_dfl_wq, &stats_flush_dwork,
FLUSH_TIME);
lru_gen_online_memcg(memcg);
@@ -3929,7 +3898,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
css_get(css);
/*
- * Ensure mem_cgroup_from_id() works once we're fully online.
+ * Ensure mem_cgroup_from_private_id() works once we're fully online.
*
* We could do this earlier and require callers to filter with
* css_tryget_online(). But right now there are no users that
@@ -3938,13 +3907,13 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
* publish it here at the end of onlining. This matches the
* regular ID destruction during offlining.
*/
- xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL);
+ xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL);
return 0;
offline_kmem:
memcg_offline_kmem(memcg);
remove_id:
- mem_cgroup_id_remove(memcg);
+ mem_cgroup_private_id_remove(memcg);
return -ENOMEM;
}
@@ -3967,7 +3936,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
drain_all_stock(memcg);
- mem_cgroup_id_put(memcg);
+ mem_cgroup_private_id_put(memcg);
}
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
@@ -4854,7 +4823,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
id = lookup_swap_cgroup_id(entry);
rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
+ memcg = mem_cgroup_from_private_id(id);
if (!memcg || !css_tryget_online(&memcg->css))
memcg = get_mem_cgroup_from_mm(mm);
rcu_read_unlock();
@@ -5051,7 +5020,7 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
memcg = folio_memcg(old);
/*
* Note that it is normal to see !memcg for a hugetlb folio.
- * For e.g, itt could have been allocated when memory_hugetlb_accounting
+ * For e.g, it could have been allocated when memory_hugetlb_accounting
* was not selected.
*/
VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
@@ -5257,22 +5226,22 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
return 0;
}
- memcg = mem_cgroup_id_get_online(memcg);
+ memcg = mem_cgroup_private_id_get_online(memcg);
if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
- mem_cgroup_id_put(memcg);
+ mem_cgroup_private_id_put(memcg);
return -ENOMEM;
}
/* Get references for the tail pages, too */
if (nr_pages > 1)
- mem_cgroup_id_get_many(memcg, nr_pages - 1);
+ mem_cgroup_private_id_get_many(memcg, nr_pages - 1);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
- swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+ swap_cgroup_record(folio, mem_cgroup_private_id(memcg), entry);
return 0;
}
@@ -5289,7 +5258,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
id = swap_cgroup_clear(entry, nr_pages);
rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
+ memcg = mem_cgroup_from_private_id(id);
if (memcg) {
if (!mem_cgroup_is_root(memcg)) {
if (do_memsw_account())
@@ -5298,7 +5267,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
page_counter_uncharge(&memcg->swap, nr_pages);
}
mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
- mem_cgroup_id_put_many(memcg, nr_pages);
+ mem_cgroup_private_id_put_many(memcg, nr_pages);
}
rcu_read_unlock();
}
diff --git a/mm/memfd.c b/mm/memfd.c
index f032c6052926..82a3f38aa30a 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -1,10 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* memfd_create system call and file sealing support
*
* Code was originally included in shmem.c, and broken out to facilitate
* use by hugetlbfs as well as tmpfs.
- *
- * This file is released under the GPL.
*/
#include <linux/fs.h>
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9fd8355176eb..ba4231858a36 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -868,7 +868,7 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
*
* MF_RECOVERED - The m-f() handler marks the page as PG_hwpoisoned'ed.
* The page has been completely isolated, that is, unmapped, taken out of
- * the buddy system, or hole-punnched out of the file mapping.
+ * the buddy system, or hole-punched out of the file mapping.
*/
static const char *action_name[] = {
[MF_IGNORED] = "Ignored",
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 864811fff409..0ae8bec86346 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -475,8 +475,7 @@ static void establish_demotion_targets(void)
*/
list_for_each_entry_reverse(memtier, &memory_tiers, list) {
tier_nodes = get_memtier_nodemask(memtier);
- nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
- if (!nodes_empty(tier_nodes)) {
+ if (nodes_and(tier_nodes, node_states[N_CPU], tier_nodes)) {
/*
* abstract distance below the max value of this memtier
* is considered toptier.
@@ -648,7 +647,7 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
if (node_memory_types[node].memtype == memtype || !memtype)
node_memory_types[node].map_count--;
/*
- * If we umapped all the attached devices to this node,
+ * If we unmapped all the attached devices to this node,
* clear the node memory type.
*/
if (!node_memory_types[node].map_count) {
@@ -956,7 +955,7 @@ static ssize_t demotion_enabled_store(struct kobject *kobj,
struct pglist_data *pgdat;
for_each_online_pgdat(pgdat)
- atomic_set(&pgdat->kswapd_failures, 0);
+ kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_OTHER);
}
return count;
diff --git a/mm/memory.c b/mm/memory.c
index b2909c94e249..b0d487229b2e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -934,7 +934,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct page *page;
if (likely(softleaf_is_swap(entry))) {
- if (swap_duplicate(entry) < 0)
+ if (swap_dup_entry_direct(entry) < 0)
return -EIO;
/* make sure dst_mm is on swapoff's mmlist. */
@@ -1256,7 +1256,7 @@ again:
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
nr = 1;
@@ -1325,7 +1325,7 @@ again:
} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(orig_src_pte, src_ptl);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
@@ -1748,7 +1748,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
nr = swap_pte_batch(pte, max_nr, ptent);
rss[MM_SWAPENTS] -= nr;
- free_swap_and_cache_nr(entry, nr);
+ swap_put_entries_direct(entry, nr);
} else if (softleaf_is_migration(entry)) {
struct folio *folio = softleaf_to_folio(entry);
@@ -1821,11 +1821,70 @@ static inline int do_zap_pte_range(struct mmu_gather *tlb,
return nr;
}
+static bool pte_table_reclaim_possible(unsigned long start, unsigned long end,
+ struct zap_details *details)
+{
+ if (!IS_ENABLED(CONFIG_PT_RECLAIM))
+ return false;
+ /* Only zap if we are allowed to and cover the full page table. */
+ return details && details->reclaim_pt && (end - start >= PMD_SIZE);
+}
+
+static bool zap_empty_pte_table(struct mm_struct *mm, pmd_t *pmd,
+ spinlock_t *ptl, pmd_t *pmdval)
+{
+ spinlock_t *pml = pmd_lockptr(mm, pmd);
+
+ if (ptl != pml && !spin_trylock(pml))
+ return false;
+
+ *pmdval = pmdp_get(pmd);
+ pmd_clear(pmd);
+ if (ptl != pml)
+ spin_unlock(pml);
+ return true;
+}
+
+static bool zap_pte_table_if_empty(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, pmd_t *pmdval)
+{
+ spinlock_t *pml, *ptl = NULL;
+ pte_t *start_pte, *pte;
+ int i;
+
+ pml = pmd_lock(mm, pmd);
+ start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, pmdval, &ptl);
+ if (!start_pte)
+ goto out_ptl;
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+ for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) {
+ if (!pte_none(ptep_get(pte)))
+ goto out_ptl;
+ }
+ pte_unmap(start_pte);
+
+ pmd_clear(pmd);
+
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);
+ return true;
+out_ptl:
+ if (start_pte)
+ pte_unmap_unlock(start_pte, ptl);
+ if (ptl != pml)
+ spin_unlock(pml);
+ return false;
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
+ bool can_reclaim_pt = pte_table_reclaim_possible(addr, end, details);
bool force_flush = false, force_break = false;
struct mm_struct *mm = tlb->mm;
int rss[NR_MM_COUNTERS];
@@ -1834,7 +1893,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t *pte;
pmd_t pmdval;
unsigned long start = addr;
- bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
bool direct_reclaim = true;
int nr;
@@ -1846,7 +1904,7 @@ retry:
return addr;
flush_tlb_batched_pending(mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
bool any_skipped = false;
@@ -1875,10 +1933,10 @@ retry:
* from being repopulated by another thread.
*/
if (can_reclaim_pt && direct_reclaim && addr == end)
- direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
+ direct_reclaim = zap_empty_pte_table(mm, pmd, ptl, &pmdval);
add_mm_rss_vec(mm, rss);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
/* Do the actual TLB flush before dropping ptl */
if (force_flush) {
@@ -1904,10 +1962,10 @@ retry:
}
if (can_reclaim_pt) {
- if (direct_reclaim)
- free_pte(mm, start, tlb, pmdval);
- else
- try_to_free_pte(mm, pmd, start, tlb);
+ if (direct_reclaim || zap_pte_table_if_empty(mm, pmd, start, &pmdval)) {
+ pte_free_tlb(tlb, pmd_pgtable(pmdval), addr);
+ mm_dec_nr_ptes(mm);
+ }
}
return addr;
@@ -2499,7 +2557,6 @@ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
{
unsigned long count = vma_pages(vma);
unsigned long uaddr = vma->vm_start;
- int ret, i;
/* Fail if the user requested offset is beyond the end of the object */
if (offset >= num)
@@ -2509,14 +2566,7 @@ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
if (count > num - offset)
return -ENXIO;
- for (i = 0; i < count; i++) {
- ret = vm_insert_page(vma, uaddr, pages[offset + i]);
- if (ret < 0)
- return ret;
- uaddr += PAGE_SIZE;
- }
-
- return 0;
+ return vm_insert_pages(vma, uaddr, pages + offset, &count);
}
/**
@@ -2816,7 +2866,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
BUG_ON(!pte_none(ptep_get(pte)));
if (!pfn_modify_allowed(pfn, prot)) {
@@ -2826,7 +2876,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(mapped_pte, ptl);
return err;
}
@@ -3177,7 +3227,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
return -EINVAL;
}
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
if (fn) {
do {
@@ -3190,7 +3240,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
}
*mask |= PGTBL_PTE_MODIFIED;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
if (mm != &init_mm)
pte_unmap_unlock(mapped_pte, ptl);
@@ -4357,12 +4407,27 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
return 0;
}
-static inline bool should_try_to_free_swap(struct folio *folio,
+/*
+ * Check if we should call folio_free_swap to free the swap cache.
+ * folio_free_swap only frees the swap cache to release the slot if swap
+ * count is zero, so we don't need to check the swap count here.
+ */
+static inline bool should_try_to_free_swap(struct swap_info_struct *si,
+ struct folio *folio,
struct vm_area_struct *vma,
+ unsigned int extra_refs,
unsigned int fault_flags)
{
if (!folio_test_swapcache(folio))
return false;
+ /*
+ * Always try to free swap cache for SWP_SYNCHRONOUS_IO devices. Swap
+ * cache can help save some IO or memory overhead, but these devices
+ * are fast, and meanwhile, swap cache pinning the slot deferring the
+ * release of metadata or fragmentation is a more critical issue.
+ */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
+ return true;
if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
folio_test_mlocked(folio))
return true;
@@ -4373,7 +4438,7 @@ static inline bool should_try_to_free_swap(struct folio *folio,
* reference only in case it's likely that we'll be the exclusive user.
*/
return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
- folio_ref_count(folio) == (1 + folio_nr_pages(folio));
+ folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio));
}
static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
@@ -4611,7 +4676,16 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+/* Sanity check that a folio is fully exclusive */
+static void check_swap_exclusive(struct folio *folio, swp_entry_t entry,
+ unsigned int nr_pages)
+{
+ /* Called under PT locked and folio locked, the swap count is stable */
+ do {
+ VM_WARN_ON_ONCE_FOLIO(__swap_count(entry) != 1, folio);
+ entry.val++;
+ } while (--nr_pages);
+}
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
@@ -4624,17 +4698,14 @@ static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct folio *swapcache, *folio = NULL;
- DECLARE_WAITQUEUE(wait, current);
+ struct folio *swapcache = NULL, *folio;
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
- bool need_clear_cache = false;
bool exclusive = false;
softleaf_t entry;
pte_t pte;
vm_fault_t ret = 0;
- void *shadow = NULL;
int nr_pages;
unsigned long page_idx;
unsigned long address;
@@ -4705,57 +4776,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio = swap_cache_get_folio(entry);
if (folio)
swap_update_readahead(folio, vma, vmf->address);
- swapcache = folio;
-
if (!folio) {
- if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
- __swap_count(entry) == 1) {
- /* skip swapcache */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
folio = alloc_swap_folio(vmf);
if (folio) {
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
-
- nr_pages = folio_nr_pages(folio);
- if (folio_test_large(folio))
- entry.val = ALIGN_DOWN(entry.val, nr_pages);
/*
- * Prevent parallel swapin from proceeding with
- * the cache flag. Otherwise, another thread
- * may finish swapin first, free the entry, and
- * swapout reusing the same entry. It's
- * undetectable as pte_same() returns true due
- * to entry reuse.
+ * folio is charged, so swapin can only fail due
+ * to raced swapin and return NULL.
*/
- if (swapcache_prepare(entry, nr_pages)) {
- /*
- * Relax a bit to prevent rapid
- * repeated page faults.
- */
- add_wait_queue(&swapcache_wq, &wait);
- schedule_timeout_uninterruptible(1);
- remove_wait_queue(&swapcache_wq, &wait);
- goto out_page;
- }
- need_clear_cache = true;
-
- memcg1_swapin(entry, nr_pages);
-
- shadow = swap_cache_get_shadow(entry);
- if (shadow)
- workingset_refault(folio, shadow);
-
- folio_add_lru(folio);
-
- /* To provide entry to swap_read_folio() */
- folio->swap = entry;
- swap_read_folio(folio, NULL);
- folio->private = NULL;
+ swapcache = swapin_folio(entry, folio);
+ if (swapcache != folio)
+ folio_put(folio);
+ folio = swapcache;
}
} else {
- folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
- vmf);
- swapcache = folio;
+ folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
}
if (!folio) {
@@ -4777,60 +4812,58 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
}
+ swapcache = folio;
ret |= folio_lock_or_retry(folio, vmf);
if (ret & VM_FAULT_RETRY)
goto out_release;
page = folio_file_page(folio, swp_offset(entry));
- if (swapcache) {
- /*
- * Make sure folio_free_swap() or swapoff did not release the
- * swapcache from under us. The page pin, and pte_same test
- * below, are not enough to exclude that. Even if it is still
- * swapcache, we need to check that the page's swap has not
- * changed.
- */
- if (unlikely(!folio_matches_swap_entry(folio, entry)))
- goto out_page;
-
- if (unlikely(PageHWPoison(page))) {
- /*
- * hwpoisoned dirty swapcache pages are kept for killing
- * owner processes (which may be unknown at hwpoison time)
- */
- ret = VM_FAULT_HWPOISON;
- goto out_page;
- }
-
- /*
- * KSM sometimes has to copy on read faults, for example, if
- * folio->index of non-ksm folios would be nonlinear inside the
- * anon VMA -- the ksm flag is lost on actual swapout.
- */
- folio = ksm_might_need_to_copy(folio, vma, vmf->address);
- if (unlikely(!folio)) {
- ret = VM_FAULT_OOM;
- folio = swapcache;
- goto out_page;
- } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
- ret = VM_FAULT_HWPOISON;
- folio = swapcache;
- goto out_page;
- }
- if (folio != swapcache)
- page = folio_page(folio, 0);
+ /*
+ * Make sure folio_free_swap() or swapoff did not release the
+ * swapcache from under us. The page pin, and pte_same test
+ * below, are not enough to exclude that. Even if it is still
+ * swapcache, we need to check that the page's swap has not
+ * changed.
+ */
+ if (unlikely(!folio_matches_swap_entry(folio, entry)))
+ goto out_page;
+ if (unlikely(PageHWPoison(page))) {
/*
- * If we want to map a page that's in the swapcache writable, we
- * have to detect via the refcount if we're really the exclusive
- * owner. Try removing the extra reference from the local LRU
- * caches if required.
+ * hwpoisoned dirty swapcache pages are kept for killing
+ * owner processes (which may be unknown at hwpoison time)
*/
- if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
- !folio_test_ksm(folio) && !folio_test_lru(folio))
- lru_add_drain();
+ ret = VM_FAULT_HWPOISON;
+ goto out_page;
}
+ /*
+ * KSM sometimes has to copy on read faults, for example, if
+ * folio->index of non-ksm folios would be nonlinear inside the
+ * anon VMA -- the ksm flag is lost on actual swapout.
+ */
+ folio = ksm_might_need_to_copy(folio, vma, vmf->address);
+ if (unlikely(!folio)) {
+ ret = VM_FAULT_OOM;
+ folio = swapcache;
+ goto out_page;
+ } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
+ ret = VM_FAULT_HWPOISON;
+ folio = swapcache;
+ goto out_page;
+ } else if (folio != swapcache)
+ page = folio_page(folio, 0);
+
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * owner. Try removing the extra reference from the local LRU
+ * caches if required.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) &&
+ !folio_test_ksm(folio) && !folio_test_lru(folio))
+ lru_add_drain();
+
folio_throttle_swaprate(folio, GFP_KERNEL);
/*
@@ -4846,24 +4879,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_nomap;
}
- /* allocated large folios for SWP_SYNCHRONOUS_IO */
- if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
- unsigned long nr = folio_nr_pages(folio);
- unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
- unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
- pte_t *folio_ptep = vmf->pte - idx;
- pte_t folio_pte = ptep_get(folio_ptep);
-
- if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
- swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
- goto out_nomap;
-
- page_idx = idx;
- address = folio_start;
- ptep = folio_ptep;
- goto check_folio;
- }
-
nr_pages = 1;
page_idx = 0;
address = vmf->address;
@@ -4908,11 +4923,36 @@ check_folio:
BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
/*
+ * If a large folio already belongs to anon mapping, then we
+ * can just go on and map it partially.
+ * If not, with the large swapin check above failing, the page table
+ * have changed, so sub pages might got charged to the wrong cgroup,
+ * or even should be shmem. So we have to free it and fallback.
+ * Nothing should have touched it, both anon and shmem checks if a
+ * large folio is fully appliable before use.
+ *
+ * This will be removed once we unify folio allocation in the swap cache
+ * layer, where allocation of a folio stabilizes the swap entries.
+ */
+ if (!folio_test_anon(folio) && folio_test_large(folio) &&
+ nr_pages != folio_nr_pages(folio)) {
+ if (!WARN_ON_ONCE(folio_test_dirty(folio)))
+ swap_cache_del_folio(folio);
+ goto out_nomap;
+ }
+
+ /*
* Check under PT lock (to protect against concurrent fork() sharing
* the swap entry concurrently) for certainly exclusive pages.
*/
if (!folio_test_ksm(folio)) {
+ /*
+ * The can_swapin_thp check above ensures all PTE have
+ * same exclusiveness. Checking just one PTE is fine.
+ */
exclusive = pte_swp_exclusive(vmf->orig_pte);
+ if (exclusive)
+ check_swap_exclusive(folio, entry, nr_pages);
if (folio != swapcache) {
/*
* We have a fresh page that is not exposed to the
@@ -4946,19 +4986,10 @@ check_folio:
/*
* Some architectures may have to restore extra metadata to the page
* when reading from swap. This metadata may be indexed by swap entry
- * so this must be called before swap_free().
+ * so this must be called before folio_put_swap().
*/
arch_swap_restore(folio_swap(entry, folio), folio);
- /*
- * Remove the swap entry and conditionally try to free up the swapcache.
- * We're already holding a reference on the page but haven't mapped it
- * yet.
- */
- swap_free_nr(entry, nr_pages);
- if (should_try_to_free_swap(folio, vma, vmf->flags))
- folio_free_swap(folio);
-
add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
pte = mk_pte(page, vma->vm_page_prot);
@@ -4990,22 +5021,24 @@ check_folio:
vmf->orig_pte = pte_advance_pfn(pte, page_idx);
/* ksm created a completely new copy */
- if (unlikely(folio != swapcache && swapcache)) {
+ if (unlikely(folio != swapcache)) {
folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
+ folio_put_swap(swapcache, NULL);
} else if (!folio_test_anon(folio)) {
/*
- * We currently only expect small !anon folios which are either
- * fully exclusive or fully shared, or new allocated large
- * folios which are fully exclusive. If we ever get large
- * folios within swapcache here, we have to be careful.
+ * We currently only expect !anon folios that are fully
+ * mappable. See the comment after can_swapin_thp above.
*/
- VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
- VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
+ folio_put_swap(folio, NULL);
} else {
+ VM_WARN_ON_ONCE(nr_pages != 1 && nr_pages != folio_nr_pages(folio));
folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address,
- rmap_flags);
+ rmap_flags);
+ folio_put_swap(folio, nr_pages == 1 ? page : NULL);
}
VM_BUG_ON(!folio_test_anon(folio) ||
@@ -5014,13 +5047,21 @@ check_folio:
arch_do_swap_page_nr(vma->vm_mm, vma, address,
pte, pte, nr_pages);
+ /*
+ * Remove the swap entry and conditionally try to free up the swapcache.
+ * Do it after mapping, so raced page faults will likely see the folio
+ * in swap cache and wait on the folio lock.
+ */
+ if (should_try_to_free_swap(si, folio, vma, nr_pages, vmf->flags))
+ folio_free_swap(folio);
+
folio_unlock(folio);
- if (folio != swapcache && swapcache) {
+ if (unlikely(folio != swapcache)) {
/*
* Hold the lock to avoid the swap entry to be reused
* until we take the PT lock for the pte_same() check
* (to avoid false positives from pte_same). For
- * further safety release the lock after the swap_free
+ * further safety release the lock after the folio_put_swap
* so that the swap count won't change under a
* parallel locked swapcache.
*/
@@ -5041,12 +5082,6 @@ unlock:
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
- /* Clear the swap cache pin for direct swapin after PTL unlock */
- if (need_clear_cache) {
- swapcache_clear(si, entry, nr_pages);
- if (waitqueue_active(&swapcache_wq))
- wake_up(&swapcache_wq);
- }
if (si)
put_swap_device(si);
return ret;
@@ -5054,18 +5089,15 @@ out_nomap:
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
folio_unlock(folio);
out_release:
folio_put(folio);
- if (folio != swapcache && swapcache) {
+ if (folio != swapcache) {
folio_unlock(swapcache);
folio_put(swapcache);
}
- if (need_clear_cache) {
- swapcache_clear(si, entry, nr_pages);
- if (waitqueue_active(&swapcache_wq))
- wake_up(&swapcache_wq);
- }
if (si)
put_swap_device(si);
return ret;
@@ -5935,7 +5967,7 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
else
*last_cpupid = folio_last_cpupid(folio);
- /* Record the current PID acceesing VMA */
+ /* Record the current PID accessing VMA */
vma_set_access_pid_bit(vma);
count_vm_numa_event(NUMA_HINT_FAULTS);
@@ -6254,7 +6286,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
* Use the maywrite version to indicate that vmf->pte may be
* modified, but since we will use pte_same() to detect the
* change of the !pte_none() entry, there is no need to recheck
- * the pmdval. Here we chooes to pass a dummy variable instead
+ * the pmdval. Here we choose to pass a dummy variable instead
* of NULL, which helps new user think about why this place is
* special.
*/
@@ -7240,40 +7272,77 @@ static inline int process_huge_page(
return 0;
}
-static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint,
- unsigned int nr_pages)
+static void clear_contig_highpages(struct page *page, unsigned long addr,
+ unsigned int nr_pages)
{
- unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio));
- int i;
+ unsigned int i, count;
+ /*
+ * When clearing we want to operate on the largest extent possible to
+ * allow for architecture specific extent based optimizations.
+ *
+ * However, since clear_user_highpages() (and primitives clear_user_pages(),
+ * clear_pages()), do not call cond_resched(), limit the unit size when
+ * running under non-preemptible scheduling models.
+ */
+ const unsigned int unit = preempt_model_preemptible() ?
+ nr_pages : PROCESS_PAGES_NON_PREEMPT_BATCH;
might_sleep();
- for (i = 0; i < nr_pages; i++) {
+
+ for (i = 0; i < nr_pages; i += count) {
cond_resched();
- clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE);
+
+ count = min(unit, nr_pages - i);
+ clear_user_highpages(page + i, addr + i * PAGE_SIZE, count);
}
}
-static int clear_subpage(unsigned long addr, int idx, void *arg)
-{
- struct folio *folio = arg;
-
- clear_user_highpage(folio_page(folio, idx), addr);
- return 0;
-}
+/*
+ * When zeroing a folio, we want to differentiate between pages in the
+ * vicinity of the faulting address where we have spatial and temporal
+ * locality, and those far away where we don't.
+ *
+ * Use a radius of 2 for determining the local neighbourhood.
+ */
+#define FOLIO_ZERO_LOCALITY_RADIUS 2
/**
* folio_zero_user - Zero a folio which will be mapped to userspace.
* @folio: The folio to zero.
- * @addr_hint: The address will be accessed or the base address if uncelar.
+ * @addr_hint: The address accessed by the user or the base address.
*/
void folio_zero_user(struct folio *folio, unsigned long addr_hint)
{
- unsigned int nr_pages = folio_nr_pages(folio);
+ const unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio));
+ const long fault_idx = (addr_hint - base_addr) / PAGE_SIZE;
+ const struct range pg = DEFINE_RANGE(0, folio_nr_pages(folio) - 1);
+ const int radius = FOLIO_ZERO_LOCALITY_RADIUS;
+ struct range r[3];
+ int i;
- if (unlikely(nr_pages > MAX_ORDER_NR_PAGES))
- clear_gigantic_page(folio, addr_hint, nr_pages);
- else
- process_huge_page(addr_hint, nr_pages, clear_subpage, folio);
+ /*
+ * Faulting page and its immediate neighbourhood. Will be cleared at the
+ * end to keep its cachelines hot.
+ */
+ r[2] = DEFINE_RANGE(clamp_t(s64, fault_idx - radius, pg.start, pg.end),
+ clamp_t(s64, fault_idx + radius, pg.start, pg.end));
+
+ /* Region to the left of the fault */
+ r[1] = DEFINE_RANGE(pg.start,
+ clamp_t(s64, r[2].start - 1, pg.start - 1, r[2].start));
+
+ /* Region to the right of the fault: always valid for the common fault_idx=0 case. */
+ r[0] = DEFINE_RANGE(clamp_t(s64, r[2].end + 1, r[2].end, pg.end + 1),
+ pg.end);
+
+ for (i = 0; i < ARRAY_SIZE(r); i++) {
+ const unsigned long addr = base_addr + r[i].start * PAGE_SIZE;
+ const unsigned int nr_pages = range_len(&r[i]);
+ struct page *page = folio_page(folio, r[i].start);
+
+ if (nr_pages > 0)
+ clear_contig_highpages(page, addr, nr_pages);
+ }
}
static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a63ec679d861..bc805029da51 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -926,7 +926,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn
*
* MOVABLE : KERNEL_EARLY
*
- * Whereby KERNEL_EARLY is memory in one of the kernel zones, available sinze
+ * Whereby KERNEL_EARLY is memory in one of the kernel zones, available since
* boot. We base our calculation on KERNEL_EARLY internally, because:
*
* a) Hotplugged memory in one of the kernel zones can sometimes still get
@@ -946,8 +946,8 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn
* We rely on "present pages" instead of "managed pages", as the latter is
* highly unreliable and dynamic in virtualized environments, and does not
* consider boot time allocations. For example, memory ballooning adjusts the
- * managed pages when inflating/deflating the balloon, and balloon compaction
- * can even migrate inflated pages between zones.
+ * managed pages when inflating/deflating the balloon, and balloon page
+ * migration can even migrate inflated pages between zones.
*
* Using "present pages" is better but some things to keep in mind are:
*
@@ -1258,7 +1258,7 @@ static pg_data_t *hotadd_init_pgdat(int nid)
* NODE_DATA is preallocated (free_area_init) but its internal
* state is not allocated completely. Add missing pieces.
* Completely offline nodes stay around and they just need
- * reintialization.
+ * reinitialization.
*/
pgdat = NODE_DATA(nid);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 68a98ba57882..dbd48502ac24 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -365,7 +365,7 @@ static const struct mempolicy_operations {
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
- return pol->flags & MPOL_MODE_FLAGS;
+ return pol->flags & MPOL_USER_NODEMASK_FLAGS;
}
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
@@ -1909,8 +1909,7 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
}
task_nodes = cpuset_mems_allowed(current);
- nodes_and(*new, *new, task_nodes);
- if (nodes_empty(*new))
+ if (!nodes_and(*new, *new, task_nodes))
goto out_put;
err = security_task_movememory(task);
diff --git a/mm/migrate.c b/mm/migrate.c
index 4688b9e38cd2..1bf2cf8c44dd 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -88,7 +88,7 @@ static const struct movable_operations *page_movable_ops(struct page *page)
* back to the buddy.
*/
if (PageOffline(page))
- /* Only balloon compaction sets PageOffline pages movable. */
+ /* Only balloon page migration sets PageOffline pages movable. */
return offline_movable_ops;
if (PageZsmalloc(page))
return zsmalloc_movable_ops;
@@ -452,11 +452,12 @@ static bool remove_migration_pte(struct folio *folio,
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
-void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
+void remove_migration_ptes(struct folio *src, struct folio *dst,
+ enum ttu_flags flags)
{
struct rmap_walk_arg rmap_walk_arg = {
.folio = src,
- .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
+ .map_unused_to_zeropage = flags & TTU_USE_SHARED_ZEROPAGE,
};
struct rmap_walk_control rwc = {
@@ -464,9 +465,9 @@ void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
.arg = &rmap_walk_arg,
};
- VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);
+ VM_BUG_ON_FOLIO((flags & TTU_USE_SHARED_ZEROPAGE) && (src != dst), src);
- if (flags & RMP_LOCKED)
+ if (flags & TTU_RMAP_LOCKED)
rmap_walk_locked(dst, &rwc);
else
rmap_walk(dst, &rwc);
@@ -1521,8 +1522,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
rc = move_to_new_folio(dst, src, mode);
if (page_was_mapped)
- remove_migration_ptes(src, !rc ? dst : src,
- ttu ? RMP_LOCKED : 0);
+ remove_migration_ptes(src, !rc ? dst : src, ttu);
if (ttu & TTU_RMAP_LOCKED)
i_mmap_unlock_write(mapping);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 23379663b1e1..0a8b31939640 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -271,7 +271,7 @@ again:
ptep = pte_offset_map_lock(mm, pmdp, start, &ptl);
if (!ptep)
goto again;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
ptep += (addr - start) / PAGE_SIZE;
for (; addr < end; addr += PAGE_SIZE, ptep++) {
@@ -313,7 +313,7 @@ again:
if (folio_test_large(folio)) {
int ret;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(ptep, ptl);
ret = migrate_vma_split_folio(folio,
migrate->fault_page);
@@ -356,7 +356,7 @@ again:
if (folio && folio_test_large(folio)) {
int ret;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(ptep, ptl);
ret = migrate_vma_split_folio(folio,
migrate->fault_page);
@@ -485,7 +485,7 @@ next:
if (unmapped)
flush_tlb_range(walk->vma, start, end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(ptep - 1, ptl);
return 0;
@@ -1419,10 +1419,10 @@ EXPORT_SYMBOL(migrate_device_range);
/**
* migrate_device_pfns() - migrate device private pfns to normal memory.
- * @src_pfns: pre-popluated array of source device private pfns to migrate.
+ * @src_pfns: pre-populated array of source device private pfns to migrate.
* @npages: number of pages to migrate.
*
- * Similar to migrate_device_range() but supports non-contiguous pre-popluated
+ * Similar to migrate_device_range() but supports non-contiguous pre-populated
* array of device pages to migrate.
*/
int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2a809cd8e7fa..1a29a719af58 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -187,7 +187,7 @@ void mm_compute_batch(int overcommit_policy)
/*
* For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
* (total memory/#cpus), and lift it to 25% for other policies
- * to easy the possible lock contention for percpu_counter
+ * to ease the possible lock contention for percpu_counter
* vm_committed_as, while the max limit is INT_MAX
*/
if (overcommit_policy == OVERCOMMIT_NEVER)
@@ -646,21 +646,18 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
return nid;
}
-int hashdist = HASHDIST_DEFAULT;
+bool hashdist = HASHDIST_DEFAULT;
static int __init set_hashdist(char *str)
{
- if (!str)
- return 0;
- hashdist = simple_strtoul(str, &str, 0);
- return 1;
+ return kstrtobool(str, &hashdist) == 0;
}
__setup("hashdist=", set_hashdist);
static inline void fixup_hashdist(void)
{
if (num_node_state(N_MEMORY) == 1)
- hashdist = 0;
+ hashdist = false;
}
#else
static inline void fixup_hashdist(void) {}
@@ -1748,7 +1745,7 @@ static void __init free_area_init_node(int nid)
lru_gen_init_pgdat(pgdat);
}
-/* Any regular or high memory on that node ? */
+/* Any regular or high memory on that node? */
static void __init check_for_memory(pg_data_t *pgdat)
{
enum zone_type zone_type;
@@ -1810,7 +1807,6 @@ static void __init set_high_memory(void)
/**
* free_area_init - Initialise all pg_data_t and zone data
- * @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
* Using the page ranges provided by memblock_set_node(), the size of each
@@ -1821,17 +1817,15 @@ static void __init set_high_memory(void)
* starts where the previous one ended. For example, ZONE_DMA32 starts
* at arch_max_dma_pfn.
*/
-void __init free_area_init(unsigned long *max_zone_pfn)
+static void __init free_area_init(void)
{
+ unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
unsigned long start_pfn, end_pfn;
int i, nid, zone;
bool descending;
- /* Record where the zone boundaries are */
- memset(arch_zone_lowest_possible_pfn, 0,
- sizeof(arch_zone_lowest_possible_pfn));
- memset(arch_zone_highest_possible_pfn, 0,
- sizeof(arch_zone_highest_possible_pfn));
+ arch_zone_limits_init(max_zone_pfn);
+ sparse_init();
start_pfn = PHYS_PFN(memblock_start_of_DRAM());
descending = arch_has_descending_max_zone_pfns();
@@ -2048,7 +2042,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
* Initialize and free pages.
*
* At this point reserved pages and struct pages that correspond to holes in
- * memblock.memory are already intialized so every free range has a valid
+ * memblock.memory are already initialized so every free range has a valid
* memory map around it.
* This ensures that access of pages that are ahead of the range being
* initialized (computing buddy page in __free_one_page()) always reads a valid
@@ -2681,13 +2675,20 @@ void __init __weak mem_init(void)
{
}
+void __init mm_core_init_early(void)
+{
+ hugetlb_cma_reserve();
+ hugetlb_bootmem_alloc();
+
+ free_area_init();
+}
+
/*
* Set up kernel memory allocators
*/
void __init mm_core_init(void)
{
arch_mm_preinit();
- hugetlb_bootmem_alloc();
/* Initializations relying on SMP setup */
BUILD_BUG_ON(MAX_ZONELISTS > 2);
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 7421b7ea8001..898c2ef1e958 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -45,65 +45,111 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
+
+/* State shared across __vma_[start, end]_exclude_readers. */
+struct vma_exclude_readers_state {
+ /* Input parameters. */
+ struct vm_area_struct *vma;
+ int state; /* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */
+ bool detaching;
+
+ /* Output parameters. */
+ bool detached;
+ bool exclusive; /* Are we exclusively locked? */
+};
+
/*
- * __vma_enter_locked() returns 0 immediately if the vma is not
- * attached, otherwise it waits for any current readers to finish and
- * returns 1. Returns -EINTR if a signal is received while waiting.
+ * Now that all readers have been evicted, mark the VMA as being out of the
+ * 'exclude readers' state.
*/
-static inline int __vma_enter_locked(struct vm_area_struct *vma,
- bool detaching, int state)
+static void __vma_end_exclude_readers(struct vma_exclude_readers_state *ves)
{
- int err;
- unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+ struct vm_area_struct *vma = ves->vma;
- mmap_assert_write_locked(vma->vm_mm);
+ VM_WARN_ON_ONCE(ves->detached);
+
+ ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG,
+ &vma->vm_refcnt);
+ __vma_lockdep_release_exclusive(vma);
+}
+
+static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves)
+{
+ const unsigned int tgt = ves->detaching ? 0 : 1;
+
+ return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG;
+}
+
+/*
+ * Mark the VMA as being in a state of excluding readers, check to see if any
+ * VMA read locks are indeed held, and if so wait for them to be released.
+ *
+ * Note that this function pairs with vma_refcount_put() which will wake up this
+ * thread when it detects that the last reader has released its lock.
+ *
+ * The ves->state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases
+ * where we wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal
+ * signal is permitted to kill it.
+ *
+ * The function sets the ves->exclusive parameter to true if readers were
+ * excluded, or false if the VMA was detached or an error arose on wait.
+ *
+ * If the function indicates an exclusive lock was acquired via ves->exclusive
+ * the caller is required to invoke __vma_end_exclude_readers() once the
+ * exclusive state is no longer required.
+ *
+ * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the
+ * function may also return -EINTR to indicate a fatal signal was received while
+ * waiting. Otherwise, the function returns 0.
+ */
+static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves)
+{
+ struct vm_area_struct *vma = ves->vma;
+ unsigned int tgt_refcnt = get_target_refcnt(ves);
+ int err = 0;
- /* Additional refcnt if the vma is attached. */
- if (!detaching)
- tgt_refcnt++;
+ mmap_assert_write_locked(vma->vm_mm);
/*
* If vma is detached then only vma_mark_attached() can raise the
* vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+ *
+ * See the comment describing the vm_area_struct->vm_refcnt field for
+ * details of possible refcnt values.
*/
- if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+ if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) {
+ ves->detached = true;
return 0;
+ }
- rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+ __vma_lockdep_acquire_exclusive(vma);
err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
refcount_read(&vma->vm_refcnt) == tgt_refcnt,
- state);
+ ves->state);
if (err) {
- if (refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt)) {
- /*
- * The wait failed, but the last reader went away
- * as well. Tell the caller the VMA is detached.
- */
- WARN_ON_ONCE(!detaching);
- err = 0;
- }
- rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+ __vma_end_exclude_readers(ves);
return err;
}
- lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
- return 1;
-}
-
-static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
-{
- *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
- rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+ __vma_lockdep_stat_mark_acquired(vma);
+ ves->exclusive = true;
+ return 0;
}
-int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
- int state)
+int __vma_start_write(struct vm_area_struct *vma, int state)
{
- int locked;
+ const unsigned int mm_lock_seq = __vma_raw_mm_seqnum(vma);
+ struct vma_exclude_readers_state ves = {
+ .vma = vma,
+ .state = state,
+ };
+ int err;
- locked = __vma_enter_locked(vma, false, state);
- if (locked < 0)
- return locked;
+ err = __vma_start_exclude_readers(&ves);
+ if (err) {
+ WARN_ON_ONCE(ves.detached);
+ return err;
+ }
/*
* We should use WRITE_ONCE() here because we can have concurrent reads
@@ -113,39 +159,42 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
*/
WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
- if (locked) {
- bool detached;
-
- __vma_exit_locked(vma, &detached);
- WARN_ON_ONCE(detached); /* vma should remain attached */
+ if (ves.exclusive) {
+ __vma_end_exclude_readers(&ves);
+ /* VMA should remain attached. */
+ WARN_ON_ONCE(ves.detached);
}
return 0;
}
EXPORT_SYMBOL_GPL(__vma_start_write);
-void vma_mark_detached(struct vm_area_struct *vma)
+void __vma_exclude_readers_for_detach(struct vm_area_struct *vma)
{
- vma_assert_write_locked(vma);
- vma_assert_attached(vma);
+ struct vma_exclude_readers_state ves = {
+ .vma = vma,
+ .state = TASK_UNINTERRUPTIBLE,
+ .detaching = true,
+ };
+ int err;
/*
- * We are the only writer, so no need to use vma_refcount_put().
- * The condition below is unlikely because the vma has been already
- * write-locked and readers can increment vm_refcnt only temporarily
- * before they check vm_lock_seq, realize the vma is locked and drop
- * back the vm_refcnt. That is a narrow window for observing a raised
- * vm_refcnt.
+ * Wait until the VMA is detached with no readers. Since we hold the VMA
+ * write lock, the only read locks that might be present are those from
+ * threads trying to acquire the read lock and incrementing the
+ * reference count before realising the write lock is held and
+ * decrementing it.
*/
- if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
- /* Wait until vma is detached with no readers. */
- if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
- bool detached;
-
- __vma_exit_locked(vma, &detached);
- WARN_ON_ONCE(!detached);
- }
+ err = __vma_start_exclude_readers(&ves);
+ if (!err && ves.exclusive) {
+ /*
+ * Once this is complete, no readers can increment the
+ * reference count, and the VMA is marked detached.
+ */
+ __vma_end_exclude_readers(&ves);
}
+ /* If an error arose but we were detached anyway, we don't care. */
+ WARN_ON_ONCE(!ves.detached);
}
/*
@@ -180,19 +229,21 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
}
/*
- * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
- * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
+ * If VM_REFCNT_EXCLUDE_READERS_FLAG is set,
+ * __refcount_inc_not_zero_limited_acquire() will fail because
+ * VM_REFCNT_LIMIT is less than VM_REFCNT_EXCLUDE_READERS_FLAG.
+ *
* Acquire fence is required here to avoid reordering against later
* vm_lock_seq check and checks inside lock_vma_under_rcu().
*/
if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
- VMA_REF_LIMIT))) {
+ VM_REFCNT_LIMIT))) {
/* return EAGAIN if vma got detached from under us */
vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
goto err;
}
- rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+ __vma_lockdep_acquire_read(vma);
if (unlikely(vma->vm_mm != mm))
goto err_unstable;
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 7468ec388455..fe5b6a031717 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
@@ -210,10 +211,9 @@ bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
PAGE_SIZE);
}
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
- bool delay_rmap, int page_size)
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
- return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size);
+ return __tlb_remove_folio_pages_size(tlb, page, 1, false, page_size);
}
#endif /* MMU_GATHER_NO_GATHER */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 283889e4f1ce..c0571445bef7 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -233,7 +233,7 @@ static long change_pte_range(struct mmu_gather *tlb,
is_private_single_threaded = vma_is_single_threaded_private(vma);
flush_tlb_batched_pending(vma->vm_mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
nr_ptes = 1;
oldpte = ptep_get(pte);
@@ -379,7 +379,7 @@ static long change_pte_range(struct mmu_gather *tlb,
}
}
} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(pte - 1, ptl);
return pages;
diff --git a/mm/mremap.c b/mm/mremap.c
index 672264807db6..8391ae17de64 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
flush_tlb_batched_pending(vma->vm_mm);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
@@ -305,7 +305,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
}
}
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
if (force_flush)
flush_tlb_range(vma, old_end - len, old_end);
if (new_ptl != old_ptl)
@@ -678,7 +678,7 @@ static bool can_realign_addr(struct pagetable_move_control *pmc,
/*
* We don't want to have to go hunting for VMAs from the end of the old
* VMA to the next page table boundary, also we want to make sure the
- * operation is wortwhile.
+ * operation is worthwhile.
*
* So ensure that we only perform this realignment if the end of the
* range being copied reaches or crosses the page table boundary.
@@ -926,7 +926,7 @@ static bool vrm_overlaps(struct vma_remap_struct *vrm)
/*
* Will a new address definitely be assigned? This either if the user specifies
* it via MREMAP_FIXED, or if MREMAP_DONTUNMAP is used, indicating we will
- * always detemrine a target address.
+ * always determine a target address.
*/
static bool vrm_implies_new_addr(struct vma_remap_struct *vrm)
{
@@ -1806,7 +1806,7 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
/*
* move_vma() need us to stay 4 maps below the threshold, otherwise
* it will bail out at the very beginning.
- * That is a problem if we have already unmaped the regions here
+ * That is a problem if we have already unmapped the regions here
* (new_addr, and old_addr), because userspace will not know the
* state of the vma's after it gets -ENOMEM.
* So, to avoid such scenario we can pre-compute if the whole
diff --git a/mm/mseal.c b/mm/mseal.c
index ae442683c5c0..316b5e1dec78 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -21,7 +21,7 @@
* It disallows unmapped regions from start to end whether they exist at the
* start, in the middle, or at the end of the range, or any combination thereof.
*
- * This is because after sealng a range, there's nothing to stop memory mapping
+ * This is because after sealing a range, there's nothing to stop memory mapping
* of ranges in the remaining gaps later, meaning that the user might then
* wrongly consider the entirety of the mseal()'d range to be sealed when it
* in fact isn't.
@@ -124,7 +124,7 @@ static int mseal_apply(struct mm_struct *mm,
* -EINVAL:
* invalid input flags.
* start address is not page aligned.
- * Address arange (start + len) overflow.
+ * Address range (start + len) overflow.
* -ENOMEM:
* addr is not a valid address (not allocated).
* end (start + len) is not a valid address.
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
index 8f5735fda0a2..391f53e63ea3 100644
--- a/mm/numa_memblks.c
+++ b/mm/numa_memblks.c
@@ -467,7 +467,7 @@ int __init numa_memblks_init(int (*init_func)(void),
* We reset memblock back to the top-down direction
* here because if we configured ACPI_NUMA, we have
* parsed SRAT in init_func(). It is ok to have the
- * reset here even if we did't configure ACPI_NUMA
+ * reset here even if we didn't configure ACPI_NUMA
* or acpi numa init fails and fallbacks to dummy
* numa init.
*/
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5eb11fbba704..5c6c95c169ee 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -228,7 +228,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
* The baseline for the badness score is the proportion of RAM that each
* task's rss, pagetable and swap space use.
*/
- points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
+ points = get_mm_rss_sum(p->mm) + get_mm_counter_sum(p->mm, MM_SWAPENTS) +
mm_pgtables_bytes(p->mm) / PAGE_SIZE;
task_unlock(p);
@@ -402,10 +402,10 @@ static int dump_task(struct task_struct *p, void *arg)
pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8lu %9lu %8ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
- task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
- get_mm_counter(task->mm, MM_ANONPAGES), get_mm_counter(task->mm, MM_FILEPAGES),
- get_mm_counter(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm),
- get_mm_counter(task->mm, MM_SWAPENTS),
+ task->tgid, task->mm->total_vm, get_mm_rss_sum(task->mm),
+ get_mm_counter_sum(task->mm, MM_ANONPAGES), get_mm_counter_sum(task->mm, MM_FILEPAGES),
+ get_mm_counter_sum(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm),
+ get_mm_counter_sum(task->mm, MM_SWAPENTS),
task->signal->oom_score_adj, task->comm);
task_unlock(task);
@@ -458,7 +458,7 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim)
static void dump_header(struct oom_control *oc)
{
- pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%d\n",
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
@@ -604,9 +604,9 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
- K(get_mm_counter(mm, MM_ANONPAGES)),
- K(get_mm_counter(mm, MM_FILEPAGES)),
- K(get_mm_counter(mm, MM_SHMEMPAGES)));
+ K(get_mm_counter_sum(mm, MM_ANONPAGES)),
+ K(get_mm_counter_sum(mm, MM_FILEPAGES)),
+ K(get_mm_counter_sum(mm, MM_SHMEMPAGES)));
out_finish:
trace_finish_task_reaping(tsk->pid);
out_unlock:
@@ -958,11 +958,11 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
*/
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
mark_oom_victim(victim);
- pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
+ pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%d\n",
message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
- K(get_mm_counter(mm, MM_ANONPAGES)),
- K(get_mm_counter(mm, MM_FILEPAGES)),
- K(get_mm_counter(mm, MM_SHMEMPAGES)),
+ K(get_mm_counter_sum(mm, MM_ANONPAGES)),
+ K(get_mm_counter_sum(mm, MM_FILEPAGES)),
+ K(get_mm_counter_sum(mm, MM_SHMEMPAGES)),
from_kuid(&init_user_ns, task_uid(victim)),
mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
task_unlock(victim);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ccdeb0e84d39..601a5e048d12 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -109,14 +109,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval);
*/
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
-/*
- * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
- * a full sync is triggered after this time elapses without any disk activity.
- */
-int laptop_mode;
-
-EXPORT_SYMBOL(laptop_mode);
-
/* End of sysctl-exported parameters */
struct wb_domain global_wb_domain;
@@ -1843,17 +1835,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
balance_domain_limits(mdtc, strictlimit);
}
- /*
- * In laptop mode, we wait until hitting the higher threshold
- * before starting background writeout, and then write out all
- * the way down to the lower threshold. So slow writers cause
- * minimal disk activity.
- *
- * In normal mode, we start background writeout at the lower
- * background_thresh, to keep the amount of dirty memory low.
- */
- if (!laptop_mode && nr_dirty > gdtc->bg_thresh &&
- !writeback_in_progress(wb))
+ if (nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb))
wb_start_background_writeback(wb);
/*
@@ -1876,10 +1858,6 @@ free_running:
break;
}
- /* Start writeback even when in laptop mode */
- if (unlikely(!writeback_in_progress(wb)))
- wb_start_background_writeback(wb);
-
mem_cgroup_flush_foreign(wb);
/*
@@ -2198,41 +2176,6 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int
}
#endif
-void laptop_mode_timer_fn(struct timer_list *t)
-{
- struct backing_dev_info *backing_dev_info =
- timer_container_of(backing_dev_info, t, laptop_mode_wb_timer);
-
- wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
-}
-
-/*
- * We've spun up the disk and we're in laptop mode: schedule writeback
- * of all dirty data a few seconds from now. If the flush is already scheduled
- * then push it back - the user is still using the disk.
- */
-void laptop_io_completion(struct backing_dev_info *info)
-{
- mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
-}
-
-/*
- * We're in laptop mode and we've just synced. The sync's writes will have
- * caused another writeback to be scheduled by laptop_io_completion.
- * Nothing needs to be written back anymore, so we unschedule the writeback.
- */
-void laptop_sync_completion(void)
-{
- struct backing_dev_info *bdi;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
- timer_delete(&bdi->laptop_mode_wb_timer);
-
- rcu_read_unlock();
-}
-
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
* if a large number of processes all perform writes at the same time.
@@ -2263,6 +2206,19 @@ static int page_writeback_cpu_online(unsigned int cpu)
#ifdef CONFIG_SYSCTL
+static int laptop_mode;
+static int laptop_mode_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_jiffies(table, write, buffer, lenp, ppos);
+
+ if (!ret && write)
+ pr_warn("%s: vm.laptop_mode is deprecated. Ignoring setting.\n",
+ current->comm);
+
+ return ret;
+}
+
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -2332,7 +2288,7 @@ static const struct ctl_table vm_page_writeback_sysctls[] = {
.data = &laptop_mode,
.maxlen = sizeof(laptop_mode),
.mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
+ .proc_handler = laptop_mode_handler,
},
};
#endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d312ebaa1e77..5fd9e4a03a4d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/mm/page_alloc.c
*
* Manages the free list, the system allocates free pages here.
* Note that kmalloc() lives in slab.c
@@ -1853,7 +1852,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
/*
* As memory initialization might be integrated into KASAN,
- * KASAN unpoisoning and memory initializion code must be
+ * KASAN unpoisoning and memory initialization code must be
* kept together to avoid discrepancies in behavior.
*/
@@ -2946,9 +2945,9 @@ static bool free_frozen_page_commit(struct zone *zone,
* 'hopeless node' to stay in that state for a while. Let
* kswapd work again by resetting kswapd_failures.
*/
- if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES &&
+ if (kswapd_test_hopeless(pgdat) &&
next_memory_node(pgdat->node_id) < MAX_NUMNODES)
- atomic_set(&pgdat->kswapd_failures, 0);
+ kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_PCP);
}
return ret;
}
@@ -3112,6 +3111,15 @@ void free_unref_folios(struct folio_batch *folios)
folio_batch_reinit(folios);
}
+static void __split_page(struct page *page, unsigned int order)
+{
+ VM_WARN_ON_PAGE(PageCompound(page), page);
+
+ split_page_owner(page, order, 0);
+ pgalloc_tag_split(page_folio(page), order, 0);
+ split_page_memcg(page, order);
+}
+
/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
@@ -3124,14 +3132,12 @@ void split_page(struct page *page, unsigned int order)
{
int i;
- VM_BUG_ON_PAGE(PageCompound(page), page);
- VM_BUG_ON_PAGE(!page_count(page), page);
+ VM_WARN_ON_PAGE(!page_count(page), page);
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
- split_page_owner(page, order, 0);
- pgalloc_tag_split(page_folio(page), order, 0);
- split_page_memcg(page, order);
+
+ __split_page(page, order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -4699,7 +4705,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
- bool can_compact = gfp_compaction_allowed(gfp_mask);
+ bool can_compact = can_direct_reclaim && gfp_compaction_allowed(gfp_mask);
bool nofail = gfp_mask & __GFP_NOFAIL;
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
@@ -4712,6 +4718,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned int cpuset_mems_cookie;
unsigned int zonelist_iter_cookie;
int reserve_flags;
+ bool compact_first = false;
+ bool can_retry_reserves = true;
if (unlikely(nofail)) {
/*
@@ -4736,6 +4744,19 @@ restart:
zonelist_iter_cookie = zonelist_iter_begin();
/*
+ * For costly allocations, try direct compaction first, as it's likely
+ * that we have enough base pages and don't need to reclaim. For non-
+ * movable high-order allocations, do that as well, as compaction will
+ * try prevent permanent fragmentation by migrating from blocks of the
+ * same migratetype.
+ */
+ if (can_compact && (costly_order || (order > 0 &&
+ ac->migratetype != MIGRATE_MOVABLE))) {
+ compact_first = true;
+ compact_priority = INIT_COMPACT_PRIORITY;
+ }
+
+ /*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
* alloc_flags precisely. So we do that now.
@@ -4766,6 +4787,8 @@ restart:
goto nopage;
}
+retry:
+ /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@@ -4777,74 +4800,6 @@ restart:
if (page)
goto got_pg;
- /*
- * For costly allocations, try direct compaction first, as it's likely
- * that we have enough base pages and don't need to reclaim. For non-
- * movable high-order allocations, do that as well, as compaction will
- * try prevent permanent fragmentation by migrating from blocks of the
- * same migratetype.
- * Don't try this for allocations that are allowed to ignore
- * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
- */
- if (can_direct_reclaim && can_compact &&
- (costly_order ||
- (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
- && !gfp_pfmemalloc_allowed(gfp_mask)) {
- page = __alloc_pages_direct_compact(gfp_mask, order,
- alloc_flags, ac,
- INIT_COMPACT_PRIORITY,
- &compact_result);
- if (page)
- goto got_pg;
-
- /*
- * Checks for costly allocations with __GFP_NORETRY, which
- * includes some THP page fault allocations
- */
- if (costly_order && (gfp_mask & __GFP_NORETRY)) {
- /*
- * If allocating entire pageblock(s) and compaction
- * failed because all zones are below low watermarks
- * or is prohibited because it recently failed at this
- * order, fail immediately unless the allocator has
- * requested compaction and reclaim retry.
- *
- * Reclaim is
- * - potentially very expensive because zones are far
- * below their low watermarks or this is part of very
- * bursty high order allocations,
- * - not guaranteed to help because isolate_freepages()
- * may not iterate over freed pages as part of its
- * linear scan, and
- * - unlikely to make entire pageblocks free on its
- * own.
- */
- if (compact_result == COMPACT_SKIPPED ||
- compact_result == COMPACT_DEFERRED)
- goto nopage;
-
- /*
- * Looks like reclaim/compaction is worth trying, but
- * sync compaction could be very expensive, so keep
- * using async compaction.
- */
- compact_priority = INIT_COMPACT_PRIORITY;
- }
- }
-
-retry:
- /*
- * Deal with possible cpuset update races or zonelist updates to avoid
- * infinite retries.
- */
- if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
- check_retry_zonelist(zonelist_iter_cookie))
- goto restart;
-
- /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
- if (alloc_flags & ALLOC_KSWAPD)
- wake_all_kswapds(order, gfp_mask, ac);
-
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
@@ -4859,12 +4814,18 @@ retry:
ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
- }
- /* Attempt with potentially adjusted zonelist and alloc_flags */
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
- if (page)
- goto got_pg;
+ /*
+ * The first time we adjust anything due to being allowed to
+ * ignore memory policies or watermarks, retry immediately. This
+ * allows us to keep the first allocation attempt optimistic so
+ * it can succeed in a zone that is still above watermarks.
+ */
+ if (can_retry_reserves) {
+ can_retry_reserves = false;
+ goto retry;
+ }
+ }
/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim)
@@ -4875,10 +4836,12 @@ retry:
goto nopage;
/* Try direct reclaim and then allocating */
- page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
- &did_some_progress);
- if (page)
- goto got_pg;
+ if (!compact_first) {
+ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags,
+ ac, &did_some_progress);
+ if (page)
+ goto got_pg;
+ }
/* Try direct compaction and then allocating */
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
@@ -4886,6 +4849,33 @@ retry:
if (page)
goto got_pg;
+ if (compact_first) {
+ /*
+ * THP page faults may attempt local node only first, but are
+ * then allowed to only compact, not reclaim, see
+ * alloc_pages_mpol().
+ *
+ * Compaction has failed above and we don't want such THP
+ * allocations to put reclaim pressure on a single node in a
+ * situation where other nodes might have plenty of available
+ * memory.
+ */
+ if (gfp_has_flags(gfp_mask, __GFP_NORETRY | __GFP_THISNODE))
+ goto nopage;
+
+ /*
+ * For the initial compaction attempt we have lowered its
+ * priority. Restore it for further retries, if those are
+ * allowed. With __GFP_NORETRY there will be a single round of
+ * reclaim and compaction with the lowered priority.
+ */
+ if (!(gfp_mask & __GFP_NORETRY))
+ compact_priority = DEF_COMPACT_PRIORITY;
+
+ compact_first = false;
+ goto retry;
+ }
+
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
goto nopage;
@@ -4898,6 +4888,15 @@ retry:
!(gfp_mask & __GFP_RETRY_MAYFAIL)))
goto nopage;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * infinite retries. No "goto retry;" can be placed above this check
+ * unless it can execute just once.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
+
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
goto retry;
@@ -5401,9 +5400,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
struct page *page = virt_to_page((void *)addr);
struct page *last = page + nr;
- split_page_owner(page, order, 0);
- pgalloc_tag_split(page_folio(page), order, 0);
- split_page_memcg(page, order);
+ __split_page(page, order);
while (page < --last)
set_page_refcounted(last);
@@ -6896,7 +6893,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
return (ret < 0) ? ret : 0;
}
-static void split_free_pages(struct list_head *list, gfp_t gfp_mask)
+static void split_free_frozen_pages(struct list_head *list, gfp_t gfp_mask)
{
int order;
@@ -6908,11 +6905,10 @@ static void split_free_pages(struct list_head *list, gfp_t gfp_mask)
int i;
post_alloc_hook(page, order, gfp_mask);
- set_page_refcounted(page);
if (!order)
continue;
- split_page(page, order);
+ __split_page(page, order);
/* Add all subpages to the order-0 head, in sequence. */
list_del(&page->lru);
@@ -6956,8 +6952,14 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
return 0;
}
+static void __free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages)
+{
+ for (; nr_pages--; pfn++)
+ free_frozen_pages(pfn_to_page(pfn), 0);
+}
+
/**
- * alloc_contig_range() -- tries to allocate given range of pages
+ * alloc_contig_frozen_range() -- tries to allocate given range of frozen pages
* @start: start PFN to allocate
* @end: one-past-the-last PFN to allocate
* @alloc_flags: allocation information
@@ -6972,12 +6974,15 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
* pageblocks in the range. Once isolated, the pageblocks should not
* be modified by others.
*
- * Return: zero on success or negative error code. On success all
- * pages which PFN is in [start, end) are allocated for the caller and
- * need to be freed with free_contig_range().
+ * All frozen pages which PFN is in [start, end) are allocated for the
+ * caller, and they could be freed with free_contig_frozen_range(),
+ * free_frozen_pages() also could be used to free compound frozen pages
+ * directly.
+ *
+ * Return: zero on success or negative error code.
*/
-int alloc_contig_range_noprof(unsigned long start, unsigned long end,
- acr_flags_t alloc_flags, gfp_t gfp_mask)
+int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end,
+ acr_flags_t alloc_flags, gfp_t gfp_mask)
{
const unsigned int order = ilog2(end - start);
unsigned long outer_start, outer_end;
@@ -7093,19 +7098,18 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
}
if (!(gfp_mask & __GFP_COMP)) {
- split_free_pages(cc.freepages, gfp_mask);
+ split_free_frozen_pages(cc.freepages, gfp_mask);
/* Free head and tail (if any) */
if (start != outer_start)
- free_contig_range(outer_start, start - outer_start);
+ __free_contig_frozen_range(outer_start, start - outer_start);
if (end != outer_end)
- free_contig_range(end, outer_end - end);
+ __free_contig_frozen_range(end, outer_end - end);
} else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) {
struct page *head = pfn_to_page(start);
check_new_pages(head, order);
prep_new_page(head, order, gfp_mask, 0);
- set_page_refcounted(head);
} else {
ret = -EINVAL;
WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
@@ -7115,36 +7119,86 @@ done:
undo_isolate_page_range(start, end);
return ret;
}
-EXPORT_SYMBOL(alloc_contig_range_noprof);
+EXPORT_SYMBOL(alloc_contig_frozen_range_noprof);
-static int __alloc_contig_pages(unsigned long start_pfn,
- unsigned long nr_pages, gfp_t gfp_mask)
+/**
+ * alloc_contig_range() -- tries to allocate given range of pages
+ * @start: start PFN to allocate
+ * @end: one-past-the-last PFN to allocate
+ * @alloc_flags: allocation information
+ * @gfp_mask: GFP mask.
+ *
+ * This routine is a wrapper around alloc_contig_frozen_range(), it can't
+ * be used to allocate compound pages, the refcount of each allocated page
+ * will be set to one.
+ *
+ * All pages which PFN is in [start, end) are allocated for the caller,
+ * and should be freed with free_contig_range() or by manually calling
+ * __free_page() on each allocated page.
+ *
+ * Return: zero on success or negative error code.
+ */
+int alloc_contig_range_noprof(unsigned long start, unsigned long end,
+ acr_flags_t alloc_flags, gfp_t gfp_mask)
{
- unsigned long end_pfn = start_pfn + nr_pages;
+ int ret;
- return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE,
- gfp_mask);
+ if (WARN_ON(gfp_mask & __GFP_COMP))
+ return -EINVAL;
+
+ ret = alloc_contig_frozen_range_noprof(start, end, alloc_flags, gfp_mask);
+ if (!ret)
+ set_pages_refcounted(pfn_to_page(start), end - start);
+
+ return ret;
}
+EXPORT_SYMBOL(alloc_contig_range_noprof);
static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
- unsigned long nr_pages)
+ unsigned long nr_pages, bool skip_hugetlb,
+ bool *skipped_hugetlb)
{
- unsigned long i, end_pfn = start_pfn + nr_pages;
+ unsigned long end_pfn = start_pfn + nr_pages;
struct page *page;
- for (i = start_pfn; i < end_pfn; i++) {
- page = pfn_to_online_page(i);
+ while (start_pfn < end_pfn) {
+ unsigned long step = 1;
+
+ page = pfn_to_online_page(start_pfn);
if (!page)
return false;
if (page_zone(page) != z)
return false;
- if (PageReserved(page))
+ if (page_is_unmovable(z, page, PB_ISOLATE_MODE_OTHER, &step))
return false;
- if (PageHuge(page))
- return false;
+ /*
+ * Only consider ranges containing hugepages if those pages are
+ * smaller than the requested contiguous region. e.g.:
+ * Move 2MB pages to free up a 1GB range.
+ * Don't move 1GB pages to free up a 2MB range.
+ *
+ * This makes contiguous allocation more reliable if multiple
+ * hugepage sizes are used without causing needless movement.
+ */
+ if (PageHuge(page)) {
+ unsigned int order;
+
+ if (skip_hugetlb) {
+ *skipped_hugetlb = true;
+ return false;
+ }
+
+ page = compound_head(page);
+ order = compound_order(page);
+ if ((order >= MAX_FOLIO_ORDER) ||
+ (nr_pages <= (1 << order)))
+ return false;
+ }
+
+ start_pfn += step;
}
return true;
}
@@ -7158,7 +7212,7 @@ static bool zone_spans_last_pfn(const struct zone *zone,
}
/**
- * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
+ * alloc_contig_frozen_pages() -- tries to find and allocate contiguous range of frozen pages
* @nr_pages: Number of contiguous pages to allocate
* @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some
* action and reclaim modifiers are supported. Reclaim modifiers
@@ -7166,28 +7220,34 @@ static bool zone_spans_last_pfn(const struct zone *zone,
* @nid: Target node
* @nodemask: Mask for other possible nodes
*
- * This routine is a wrapper around alloc_contig_range(). It scans over zones
- * on an applicable zonelist to find a contiguous pfn range which can then be
- * tried for allocation with alloc_contig_range(). This routine is intended
- * for allocation requests which can not be fulfilled with the buddy allocator.
+ * This routine is a wrapper around alloc_contig_frozen_range(). It scans over
+ * zones on an applicable zonelist to find a contiguous pfn range which can then
+ * be tried for allocation with alloc_contig_frozen_range(). This routine is
+ * intended for allocation requests which can not be fulfilled with the buddy
+ * allocator.
*
* The allocated memory is always aligned to a page boundary. If nr_pages is a
* power of two, then allocated range is also guaranteed to be aligned to same
* nr_pages (e.g. 1GB request would be aligned to 1GB).
*
- * Allocated pages can be freed with free_contig_range() or by manually calling
- * __free_page() on each allocated page.
+ * Allocated frozen pages need be freed with free_contig_frozen_range(),
+ * or by manually calling free_frozen_pages() on each allocated frozen
+ * non-compound page, for compound frozen pages could be freed with
+ * free_frozen_pages() directly.
*
- * Return: pointer to contiguous pages on success, or NULL if not successful.
+ * Return: pointer to contiguous frozen pages on success, or NULL if not successful.
*/
-struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask)
+struct page *alloc_contig_frozen_pages_noprof(unsigned long nr_pages,
+ gfp_t gfp_mask, int nid, nodemask_t *nodemask)
{
unsigned long ret, pfn, flags;
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
+ bool skip_hugetlb = true;
+ bool skipped_hugetlb = false;
+retry:
zonelist = node_zonelist(nid, gfp_mask);
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(gfp_mask), nodemask) {
@@ -7195,16 +7255,20 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
pfn = ALIGN(zone->zone_start_pfn, nr_pages);
while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
- if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
+ if (pfn_range_valid_contig(zone, pfn, nr_pages,
+ skip_hugetlb,
+ &skipped_hugetlb)) {
/*
* We release the zone lock here because
- * alloc_contig_range() will also lock the zone
- * at some point. If there's an allocation
- * spinning on this lock, it may win the race
- * and cause alloc_contig_range() to fail...
+ * alloc_contig_frozen_range() will also lock
+ * the zone at some point. If there's an
+ * allocation spinning on this lock, it may
+ * win the race and cause allocation to fail.
*/
spin_unlock_irqrestore(&zone->lock, flags);
- ret = __alloc_contig_pages(pfn, nr_pages,
+ ret = alloc_contig_frozen_range_noprof(pfn,
+ pfn + nr_pages,
+ ACR_FLAGS_NONE,
gfp_mask);
if (!ret)
return pfn_to_page(pfn);
@@ -7214,35 +7278,96 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
}
spin_unlock_irqrestore(&zone->lock, flags);
}
+ /*
+ * If we failed, retry the search, but treat regions with HugeTLB pages
+ * as valid targets. This retains fast-allocations on first pass
+ * without trying to migrate HugeTLB pages (which may fail). On the
+ * second pass, we will try moving HugeTLB pages when those pages are
+ * smaller than the requested contiguous region size.
+ */
+ if (skip_hugetlb && skipped_hugetlb) {
+ skip_hugetlb = false;
+ goto retry;
+ }
return NULL;
}
-#endif /* CONFIG_CONTIG_ALLOC */
+EXPORT_SYMBOL(alloc_contig_frozen_pages_noprof);
-void free_contig_range(unsigned long pfn, unsigned long nr_pages)
+/**
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
+ * @nr_pages: Number of contiguous pages to allocate
+ * @gfp_mask: GFP mask.
+ * @nid: Target node
+ * @nodemask: Mask for other possible nodes
+ *
+ * This routine is a wrapper around alloc_contig_frozen_pages(), it can't
+ * be used to allocate compound pages, the refcount of each allocated page
+ * will be set to one.
+ *
+ * Allocated pages can be freed with free_contig_range() or by manually
+ * calling __free_page() on each allocated page.
+ *
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
+ */
+struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
{
- unsigned long count = 0;
- struct folio *folio = pfn_folio(pfn);
+ struct page *page;
- if (folio_test_large(folio)) {
- int expected = folio_nr_pages(folio);
+ if (WARN_ON(gfp_mask & __GFP_COMP))
+ return NULL;
- if (nr_pages == expected)
- folio_put(folio);
- else
- WARN(true, "PFN %lu: nr_pages %lu != expected %d\n",
- pfn, nr_pages, expected);
+ page = alloc_contig_frozen_pages_noprof(nr_pages, gfp_mask, nid,
+ nodemask);
+ if (page)
+ set_pages_refcounted(page, nr_pages);
+
+ return page;
+}
+EXPORT_SYMBOL(alloc_contig_pages_noprof);
+
+/**
+ * free_contig_frozen_range() -- free the contiguous range of frozen pages
+ * @pfn: start PFN to free
+ * @nr_pages: Number of contiguous frozen pages to free
+ *
+ * This can be used to free the allocated compound/non-compound frozen pages.
+ */
+void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages)
+{
+ struct page *first_page = pfn_to_page(pfn);
+ const unsigned int order = ilog2(nr_pages);
+
+ if (WARN_ON_ONCE(first_page != compound_head(first_page)))
+ return;
+
+ if (PageHead(first_page)) {
+ WARN_ON_ONCE(order != compound_order(first_page));
+ free_frozen_pages(first_page, order);
return;
}
- for (; nr_pages--; pfn++) {
- struct page *page = pfn_to_page(pfn);
+ __free_contig_frozen_range(pfn, nr_pages);
+}
+EXPORT_SYMBOL(free_contig_frozen_range);
- count += page_count(page) != 1;
- __free_page(page);
- }
- WARN(count != 0, "%lu pages are still in use!\n", count);
+/**
+ * free_contig_range() -- free the contiguous range of pages
+ * @pfn: start PFN to free
+ * @nr_pages: Number of contiguous pages to free
+ *
+ * This can be only used to free the allocated non-compound pages.
+ */
+void free_contig_range(unsigned long pfn, unsigned long nr_pages)
+{
+ if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn))))
+ return;
+
+ for (; nr_pages--; pfn++)
+ __free_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL(free_contig_range);
+#endif /* CONFIG_CONTIG_ALLOC */
/*
* Effectively disable pcplists for the zone by setting the high limit to 0
@@ -7658,7 +7783,7 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned
* unsafe in NMI. If spin_trylock() is called from hard IRQ the current
* task may be waiting for one rt_spin_lock, but rt_spin_trylock() will
* mark the task as the owner of another rt_spin_lock which will
- * confuse PI logic, so return immediately if called form hard IRQ or
+ * confuse PI logic, so return immediately if called from hard IRQ or
* NMI.
*
* Note, irqs_disabled() case is ok. This function can be called
diff --git a/mm/page_io.c b/mm/page_io.c
index 3c342db77ce3..a2c034660c80 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -450,14 +450,14 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
/*
- * ->flags can be updated non-atomicially (scan_swap_map_slots),
+ * ->flags can be updated non-atomically (scan_swap_map_slots),
* but that will never affect SWP_FS_OPS, so the data_race
* is safe.
*/
if (data_race(sis->flags & SWP_FS_OPS))
swap_writepage_fs(folio, swap_plug);
/*
- * ->flags can be updated non-atomicially (scan_swap_map_slots),
+ * ->flags can be updated non-atomically (scan_swap_map_slots),
* but that will never affect SWP_SYNCHRONOUS_IO, so the data_race
* is safe.
*/
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f72b6cd38b95..c48ff5c00244 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -15,6 +15,100 @@
#define CREATE_TRACE_POINTS
#include <trace/events/page_isolation.h>
+bool page_is_unmovable(struct zone *zone, struct page *page,
+ enum pb_isolate_mode mode, unsigned long *step)
+{
+ /*
+ * Both, bootmem allocations and memory holes are marked
+ * PG_reserved and are unmovable. We can even have unmovable
+ * allocations inside ZONE_MOVABLE, for example when
+ * specifying "movablecore".
+ */
+ if (PageReserved(page))
+ return true;
+
+ /*
+ * If the zone is movable and we have ruled out all reserved
+ * pages then it should be reasonably safe to assume the rest
+ * is movable.
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ return false;
+
+ /*
+ * Hugepages are not in LRU lists, but they're movable.
+ * THPs are on the LRU, but need to be counted as #small pages.
+ * We need not scan over tail pages because we don't
+ * handle each tail page individually in migration.
+ */
+ if (PageHuge(page) || PageCompound(page)) {
+ struct folio *folio = page_folio(page);
+
+ if (folio_test_hugetlb(folio)) {
+ struct hstate *h;
+
+ if (!IS_ENABLED(CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION))
+ return true;
+
+ /*
+ * The huge page may be freed so can not
+ * use folio_hstate() directly.
+ */
+ h = size_to_hstate(folio_size(folio));
+ if (h && !hugepage_migration_supported(h))
+ return true;
+
+ } else if (!folio_test_lru(folio)) {
+ return true;
+ }
+
+ *step = folio_nr_pages(folio) - folio_page_idx(folio, page);
+ return false;
+ }
+
+ /*
+ * We can't use page_count without pin a page
+ * because another CPU can free compound page.
+ * This check already skips compound tails of THP
+ * because their page->_refcount is zero at all time.
+ */
+ if (!page_ref_count(page)) {
+ if (PageBuddy(page))
+ *step = (1 << buddy_order(page));
+ return false;
+ }
+
+ /*
+ * The HWPoisoned page may be not in buddy system, and
+ * page_count() is not 0.
+ */
+ if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page))
+ return false;
+
+ /*
+ * We treat all PageOffline() pages as movable when offlining
+ * to give drivers a chance to decrement their reference count
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
+ * can be offlined as there are no direct references anymore.
+ * For actually unmovable PageOffline() where the driver does
+ * not support this, we will fail later when trying to actually
+ * move these pages that still have a reference count > 0.
+ * (false negatives in this function only)
+ */
+ if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page))
+ return false;
+
+ if (PageLRU(page) || page_has_movable_ops(page))
+ return false;
+
+ /*
+ * If there are RECLAIMABLE pages, we need to check
+ * it. But now, memory offline itself doesn't call
+ * shrink_node_slabs() and it still to be fixed.
+ */
+ return true;
+}
+
/*
* This function checks whether the range [start_pfn, end_pfn) includes
* unmovable pages or not. The range must fall into a single pageblock and
@@ -35,7 +129,6 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
{
struct page *page = pfn_to_page(start_pfn);
struct zone *zone = page_zone(page);
- unsigned long pfn;
VM_BUG_ON(pageblock_start_pfn(start_pfn) !=
pageblock_start_pfn(end_pfn - 1));
@@ -52,96 +145,14 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
return page;
}
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- page = pfn_to_page(pfn);
+ while (start_pfn < end_pfn) {
+ unsigned long step = 1;
- /*
- * Both, bootmem allocations and memory holes are marked
- * PG_reserved and are unmovable. We can even have unmovable
- * allocations inside ZONE_MOVABLE, for example when
- * specifying "movablecore".
- */
- if (PageReserved(page))
+ page = pfn_to_page(start_pfn);
+ if (page_is_unmovable(zone, page, mode, &step))
return page;
- /*
- * If the zone is movable and we have ruled out all reserved
- * pages then it should be reasonably safe to assume the rest
- * is movable.
- */
- if (zone_idx(zone) == ZONE_MOVABLE)
- continue;
-
- /*
- * Hugepages are not in LRU lists, but they're movable.
- * THPs are on the LRU, but need to be counted as #small pages.
- * We need not scan over tail pages because we don't
- * handle each tail page individually in migration.
- */
- if (PageHuge(page) || PageTransCompound(page)) {
- struct folio *folio = page_folio(page);
- unsigned int skip_pages;
-
- if (PageHuge(page)) {
- struct hstate *h;
-
- /*
- * The huge page may be freed so can not
- * use folio_hstate() directly.
- */
- h = size_to_hstate(folio_size(folio));
- if (h && !hugepage_migration_supported(h))
- return page;
- } else if (!folio_test_lru(folio)) {
- return page;
- }
-
- skip_pages = folio_nr_pages(folio) - folio_page_idx(folio, page);
- pfn += skip_pages - 1;
- continue;
- }
-
- /*
- * We can't use page_count without pin a page
- * because another CPU can free compound page.
- * This check already skips compound tails of THP
- * because their page->_refcount is zero at all time.
- */
- if (!page_ref_count(page)) {
- if (PageBuddy(page))
- pfn += (1 << buddy_order(page)) - 1;
- continue;
- }
-
- /*
- * The HWPoisoned page may be not in buddy system, and
- * page_count() is not 0.
- */
- if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page))
- continue;
-
- /*
- * We treat all PageOffline() pages as movable when offlining
- * to give drivers a chance to decrement their reference count
- * in MEM_GOING_OFFLINE in order to indicate that these pages
- * can be offlined as there are no direct references anymore.
- * For actually unmovable PageOffline() where the driver does
- * not support this, we will fail later when trying to actually
- * move these pages that still have a reference count > 0.
- * (false negatives in this function only)
- */
- if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page))
- continue;
-
- if (PageLRU(page) || page_has_movable_ops(page))
- continue;
-
- /*
- * If there are RECLAIMABLE pages, we need to check
- * it. But now, memory offline itself doesn't call
- * shrink_node_slabs() and it still to be fixed.
- */
- return page;
+ start_pfn += step;
}
return NULL;
}
@@ -301,7 +312,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* pageblock. When not all pageblocks within a page are isolated at the same
* time, free page accounting can go wrong. For example, in the case of
* MAX_PAGE_ORDER = pageblock_order + 1, a MAX_PAGE_ORDER page has two
- * pagelbocks.
+ * pageblocks.
* [ MAX_PAGE_ORDER ]
* [ pageblock0 | pageblock1 ]
* When either pageblock is isolated, if it is a free page, the page is not
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index e4c428e61d8c..8a03effda749 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -123,7 +123,7 @@ page_reporting_drain(struct page_reporting_dev_info *prdev,
continue;
/*
- * If page was not comingled with another page we can
+ * If page was not commingled with another page we can
* consider the result to be "reported" since the page
* hasn't been modified, otherwise we will need to
* report on the new larger page when we make our way
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 741884645ab0..2708c2b3ac1f 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -145,34 +145,37 @@ void __page_table_check_zero(struct page *page, unsigned int order)
rcu_read_unlock();
}
-void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
+void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t pte)
{
if (&init_mm == mm)
return;
- if (pte_user_accessible_page(pte)) {
+ if (pte_user_accessible_page(pte, addr)) {
page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT);
}
}
EXPORT_SYMBOL(__page_table_check_pte_clear);
-void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
+void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
+ pmd_t pmd)
{
if (&init_mm == mm)
return;
- if (pmd_user_accessible_page(pmd)) {
+ if (pmd_user_accessible_page(pmd, addr)) {
page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT);
}
}
EXPORT_SYMBOL(__page_table_check_pmd_clear);
-void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
+void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
+ pud_t pud)
{
if (&init_mm == mm)
return;
- if (pud_user_accessible_page(pud)) {
+ if (pud_user_accessible_page(pud, addr)) {
page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT);
}
}
@@ -196,8 +199,8 @@ static void page_table_check_pte_flags(pte_t pte)
}
}
-void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
- unsigned int nr)
+void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned int nr)
{
unsigned int i;
@@ -207,8 +210,8 @@ void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
page_table_check_pte_flags(pte);
for (i = 0; i < nr; i++)
- __page_table_check_pte_clear(mm, ptep_get(ptep + i));
- if (pte_user_accessible_page(pte))
+ __page_table_check_pte_clear(mm, addr + PAGE_SIZE * i, ptep_get(ptep + i));
+ if (pte_user_accessible_page(pte, addr))
page_table_check_set(pte_pfn(pte), nr, pte_write(pte));
}
EXPORT_SYMBOL(__page_table_check_ptes_set);
@@ -225,8 +228,8 @@ static inline void page_table_check_pmd_flags(pmd_t pmd)
}
}
-void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd,
- unsigned int nr)
+void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd, unsigned int nr)
{
unsigned long stride = PMD_SIZE >> PAGE_SHIFT;
unsigned int i;
@@ -237,14 +240,14 @@ void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd,
page_table_check_pmd_flags(pmd);
for (i = 0; i < nr; i++)
- __page_table_check_pmd_clear(mm, *(pmdp + i));
- if (pmd_user_accessible_page(pmd))
+ __page_table_check_pmd_clear(mm, addr + PMD_SIZE * i, *(pmdp + i));
+ if (pmd_user_accessible_page(pmd, addr))
page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd));
}
EXPORT_SYMBOL(__page_table_check_pmds_set);
-void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud,
- unsigned int nr)
+void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud, unsigned int nr)
{
unsigned long stride = PUD_SIZE >> PAGE_SHIFT;
unsigned int i;
@@ -253,8 +256,8 @@ void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud,
return;
for (i = 0; i < nr; i++)
- __page_table_check_pud_clear(mm, *(pudp + i));
- if (pud_user_accessible_page(pud))
+ __page_table_check_pud_clear(mm, addr + PUD_SIZE * i, *(pudp + i));
+ if (pud_user_accessible_page(pud, addr))
page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud));
}
EXPORT_SYMBOL(__page_table_check_puds_set);
@@ -273,7 +276,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm,
if (WARN_ON(!ptep))
return;
for (i = 0; i < PTRS_PER_PTE; i++) {
- __page_table_check_pte_clear(mm, ptep_get(ptep));
+ __page_table_check_pte_clear(mm, addr, ptep_get(ptep));
addr += PAGE_SIZE;
ptep++;
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 90cc346a6ecf..a94c401ab2cf 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -313,7 +313,8 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
unsigned long end)
{
unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
- return boundary < end ? boundary : end;
+
+ return min(boundary, end);
}
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
diff --git a/mm/percpu.c b/mm/percpu.c
index 81462ce5866e..a2107bdebf0b 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1279,12 +1279,16 @@ static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
int bit_off, bits, end, oslot, freed;
lockdep_assert_held(&pcpu_lock);
- pcpu_stats_area_dealloc(chunk);
oslot = pcpu_chunk_slot(chunk);
bit_off = off / PCPU_MIN_ALLOC_SIZE;
+ /* check invalid free */
+ if (!test_bit(bit_off, chunk->alloc_map) ||
+ !test_bit(bit_off, chunk->bound_map))
+ return 0;
+
/* find end index */
end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
bit_off + 1);
@@ -1303,6 +1307,8 @@ static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
pcpu_chunk_relocate(chunk, oslot);
+ pcpu_stats_area_dealloc(chunk);
+
return freed;
}
@@ -2242,6 +2248,13 @@ void free_percpu(void __percpu *ptr)
spin_lock_irqsave(&pcpu_lock, flags);
size = pcpu_free_area(chunk, off);
+ if (size == 0) {
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+
+ /* invalid percpu free */
+ WARN_ON_ONCE(1);
+ return;
+ }
pcpu_alloc_tag_free_hook(chunk, off, size);
diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c
deleted file mode 100644
index 0d9cfbf4fe5d..000000000000
--- a/mm/pt_reclaim.c
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/hugetlb.h>
-#include <linux/pgalloc.h>
-
-#include <asm-generic/tlb.h>
-
-#include "internal.h"
-
-bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
- struct zap_details *details)
-{
- return details && details->reclaim_pt && (end - start >= PMD_SIZE);
-}
-
-bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval)
-{
- spinlock_t *pml = pmd_lockptr(mm, pmd);
-
- if (!spin_trylock(pml))
- return false;
-
- *pmdval = pmdp_get_lockless(pmd);
- pmd_clear(pmd);
- spin_unlock(pml);
-
- return true;
-}
-
-void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
- pmd_t pmdval)
-{
- pte_free_tlb(tlb, pmd_pgtable(pmdval), addr);
- mm_dec_nr_ptes(mm);
-}
-
-void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
- struct mmu_gather *tlb)
-{
- pmd_t pmdval;
- spinlock_t *pml, *ptl = NULL;
- pte_t *start_pte, *pte;
- int i;
-
- pml = pmd_lock(mm, pmd);
- start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
- if (!start_pte)
- goto out_ptl;
- if (ptl != pml)
- spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
-
- /* Check if it is empty PTE page */
- for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) {
- if (!pte_none(ptep_get(pte)))
- goto out_ptl;
- }
- pte_unmap(start_pte);
-
- pmd_clear(pmd);
-
- if (ptl != pml)
- spin_unlock(ptl);
- spin_unlock(pml);
-
- free_pte(mm, addr, tlb, pmdval);
-
- return;
-out_ptl:
- if (start_pte)
- pte_unmap_unlock(start_pte, ptl);
- if (ptl != pml)
- spin_unlock(pml);
-}
diff --git a/mm/readahead.c b/mm/readahead.c
index f43d03558e62..7b05082c89ea 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -439,7 +439,7 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
* based on I/O request size and the max_readahead.
*
* The code ramps up the readahead size aggressively at first, but slow down as
- * it approaches max_readhead.
+ * it approaches max_readahead.
*/
static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
diff --git a/mm/rmap.c b/mm/rmap.c
index 7b9879ef442d..ab099405151f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1,8 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* mm/rmap.c - physical to virtual reverse mappings
*
* Copyright 2001, Rik van Riel <riel@conectiva.com.br>
- * Released under the General Public License (GPL).
*
* Simple, low overhead reverse mapping scheme.
* Please try to keep this thing as modular as possible.
@@ -82,6 +82,7 @@
#include <trace/events/migrate.h>
#include "internal.h"
+#include "swap.h"
static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;
@@ -146,14 +147,13 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}
-static void anon_vma_chain_link(struct vm_area_struct *vma,
- struct anon_vma_chain *avc,
- struct anon_vma *anon_vma)
+static void anon_vma_chain_assign(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc,
+ struct anon_vma *anon_vma)
{
avc->vma = vma;
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}
/**
@@ -210,7 +210,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
- anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_chain_assign(vma, avc, anon_vma);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
anon_vma->num_active_vmas++;
allocated = NULL;
avc = NULL;
@@ -231,97 +232,141 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
return -ENOMEM;
}
-/*
- * This is a useful helper function for locking the anon_vma root as
- * we traverse the vma->anon_vma_chain, looping over anon_vma's that
- * have the same vma.
- *
- * Such anon_vma's should have the same root, so you'd expect to see
- * just a single mutex_lock for the whole traversal.
- */
-static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
-{
- struct anon_vma *new_root = anon_vma->root;
- if (new_root != root) {
- if (WARN_ON_ONCE(root))
- up_write(&root->rwsem);
- root = new_root;
- down_write(&root->rwsem);
- }
- return root;
+static void check_anon_vma_clone(struct vm_area_struct *dst,
+ struct vm_area_struct *src,
+ enum vma_operation operation)
+{
+ /* The write lock must be held. */
+ mmap_assert_write_locked(src->vm_mm);
+ /* If not a fork then must be on same mm. */
+ VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm);
+
+ /* If we have anything to do src->anon_vma must be provided. */
+ VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
+ VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma);
+ /* We are establishing a new anon_vma_chain. */
+ VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain));
+ /*
+ * On fork, dst->anon_vma is set NULL (temporarily). Otherwise, anon_vma
+ * must be the same across dst and src.
+ */
+ VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
+ /*
+ * Essentially equivalent to above - if not a no-op, we should expect
+ * dst->anon_vma to be set for everything except a fork.
+ */
+ VM_WARN_ON_ONCE(operation != VMA_OP_FORK && src->anon_vma &&
+ !dst->anon_vma);
+ /* For the anon_vma to be compatible, it can only be singular. */
+ VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED &&
+ !list_is_singular(&src->anon_vma_chain));
+#ifdef CONFIG_PER_VMA_LOCK
+ /* Only merging an unfaulted VMA leaves the destination attached. */
+ VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED &&
+ vma_is_attached(dst));
+#endif
}
-static inline void unlock_anon_vma_root(struct anon_vma *root)
+static void maybe_reuse_anon_vma(struct vm_area_struct *dst,
+ struct anon_vma *anon_vma)
{
- if (root)
- up_write(&root->rwsem);
+ /* If already populated, nothing to do.*/
+ if (dst->anon_vma)
+ return;
+
+ /*
+ * We reuse an anon_vma if any linking VMAs were unmapped and it has
+ * only a single child at most.
+ */
+ if (anon_vma->num_active_vmas > 0)
+ return;
+ if (anon_vma->num_children > 1)
+ return;
+
+ dst->anon_vma = anon_vma;
+ anon_vma->num_active_vmas++;
}
-/*
- * Attach the anon_vmas from src to dst.
- * Returns 0 on success, -ENOMEM on failure.
- *
- * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
- * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
- * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
- * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
- * call, we can identify this case by checking (!dst->anon_vma &&
- * src->anon_vma).
- *
- * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
- * and reuse existing anon_vma which has no vmas and only one child anon_vma.
- * This prevents degradation of anon_vma hierarchy to endless linear chain in
- * case of constantly forking task. On the other hand, an anon_vma with more
- * than one child isn't reused even if there was no alive vma, thus rmap
- * walker has a good chance of avoiding scanning the whole hierarchy when it
- * searches where page is mapped.
+static void cleanup_partial_anon_vmas(struct vm_area_struct *vma);
+
+/**
+ * anon_vma_clone - Establishes new anon_vma_chain objects in @dst linking to
+ * all of the anon_vma objects contained within @src anon_vma_chain's.
+ * @dst: The destination VMA with an empty anon_vma_chain.
+ * @src: The source VMA we wish to duplicate.
+ * @operation: The type of operation which resulted in the clone.
+ *
+ * This is the heart of the VMA side of the anon_vma implementation - we invoke
+ * this function whenever we need to set up a new VMA's anon_vma state.
+ *
+ * This is invoked for:
+ *
+ * - VMA Merge, but only when @dst is unfaulted and @src is faulted - meaning we
+ * clone @src into @dst.
+ * - VMA split.
+ * - VMA (m)remap.
+ * - Fork of faulted VMA.
+ *
+ * In all cases other than fork this is simply a duplication. Fork additionally
+ * adds a new active anon_vma.
+ *
+ * ONLY in the case of fork do we try to 'reuse' existing anon_vma's in an
+ * anon_vma hierarchy, reusing anon_vma's which have no VMA associated with them
+ * but do have a single child. This is to avoid waste of memory when repeatedly
+ * forking.
+ *
+ * Returns: 0 on success, -ENOMEM on failure.
*/
-int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+ enum vma_operation operation)
{
struct anon_vma_chain *avc, *pavc;
- struct anon_vma *root = NULL;
-
- list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
- struct anon_vma *anon_vma;
-
- avc = anon_vma_chain_alloc(GFP_NOWAIT);
- if (unlikely(!avc)) {
- unlock_anon_vma_root(root);
- root = NULL;
- avc = anon_vma_chain_alloc(GFP_KERNEL);
- if (!avc)
- goto enomem_failure;
- }
- anon_vma = pavc->anon_vma;
- root = lock_anon_vma_root(root, anon_vma);
- anon_vma_chain_link(dst, avc, anon_vma);
+ struct anon_vma *active_anon_vma = src->anon_vma;
- /*
- * Reuse existing anon_vma if it has no vma and only one
- * anon_vma child.
- *
- * Root anon_vma is never reused:
- * it has self-parent reference and at least one child.
- */
- if (!dst->anon_vma && src->anon_vma &&
- anon_vma->num_children < 2 &&
- anon_vma->num_active_vmas == 0)
- dst->anon_vma = anon_vma;
+ check_anon_vma_clone(dst, src, operation);
+
+ if (!active_anon_vma)
+ return 0;
+
+ /*
+ * Allocate AVCs. We don't need an anon_vma lock for this as we
+ * are not updating the anon_vma rbtree nor are we changing
+ * anon_vma statistics.
+ *
+ * Either src, dst have the same mm for which we hold an exclusive mmap
+ * write lock, or we are forking and we hold it on src->vm_mm and dst is
+ * not yet accessible to other threads so there's no possibliity of the
+ * unlinked AVC's being observed yet.
+ */
+ list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto enomem_failure;
+
+ anon_vma_chain_assign(dst, avc, pavc->anon_vma);
}
- if (dst->anon_vma)
+
+ /*
+ * Now link the anon_vma's back to the newly inserted AVCs.
+ * Note that all anon_vma's share the same root.
+ */
+ anon_vma_lock_write(src->anon_vma);
+ list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
+ if (operation == VMA_OP_FORK)
+ maybe_reuse_anon_vma(dst, anon_vma);
+ }
+
+ if (operation != VMA_OP_FORK)
dst->anon_vma->num_active_vmas++;
- unlock_anon_vma_root(root);
+
+ anon_vma_unlock_write(active_anon_vma);
return 0;
enomem_failure:
- /*
- * dst->anon_vma is dropped here otherwise its num_active_vmas can
- * be incorrectly decremented in unlink_anon_vmas().
- * We can safely do this because callers of anon_vma_clone() don't care
- * about dst->anon_vma if anon_vma_clone() failed.
- */
- dst->anon_vma = NULL;
- unlink_anon_vmas(dst);
+ cleanup_partial_anon_vmas(dst);
return -ENOMEM;
}
@@ -334,7 +379,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
struct anon_vma_chain *avc;
struct anon_vma *anon_vma;
- int error;
+ int rc;
/* Don't bother if the parent process has no anon_vma here. */
if (!pvma->anon_vma)
@@ -343,27 +388,35 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
vma->anon_vma = NULL;
+ anon_vma = anon_vma_alloc();
+ if (!anon_vma)
+ return -ENOMEM;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc) {
+ put_anon_vma(anon_vma);
+ return -ENOMEM;
+ }
+
/*
* First, attach the new VMA to the parent VMA's anon_vmas,
* so rmap can find non-COWed pages in child processes.
*/
- error = anon_vma_clone(vma, pvma);
- if (error)
- return error;
-
- /* An existing anon_vma has been reused, all done then. */
- if (vma->anon_vma)
- return 0;
+ rc = anon_vma_clone(vma, pvma, VMA_OP_FORK);
+ /* An error arose or an existing anon_vma was reused, all done then. */
+ if (rc || vma->anon_vma) {
+ put_anon_vma(anon_vma);
+ anon_vma_chain_free(avc);
+ return rc;
+ }
- /* Then add our own anon_vma. */
- anon_vma = anon_vma_alloc();
- if (!anon_vma)
- goto out_error;
- anon_vma->num_active_vmas++;
- avc = anon_vma_chain_alloc(GFP_KERNEL);
- if (!avc)
- goto out_error_free_anon_vma;
+ /*
+ * OK no reuse, so add our own anon_vma.
+ *
+ * Since it is not linked anywhere we can safely manipulate anon_vma
+ * fields without a lock.
+ */
+ anon_vma->num_active_vmas = 1;
/*
* The root anon_vma's rwsem is the lock actually used when we
* lock any of the anon_vmas in this anon_vma tree.
@@ -378,24 +431,59 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
+ anon_vma_chain_assign(vma, avc, anon_vma);
+ /* Now let rmap see it. */
anon_vma_lock_write(anon_vma);
- anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
anon_vma->parent->num_children++;
anon_vma_unlock_write(anon_vma);
return 0;
+}
- out_error_free_anon_vma:
- put_anon_vma(anon_vma);
- out_error:
- unlink_anon_vmas(vma);
- return -ENOMEM;
+/*
+ * In the unfortunate case of anon_vma_clone() failing to allocate memory we
+ * have to clean things up.
+ *
+ * Since we allocate anon_vma_chain's before we insert them into the interval
+ * trees, we simply have to free up the AVC's and remove the entries from the
+ * VMA's anon_vma_chain.
+ */
+static void cleanup_partial_anon_vmas(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc, *next;
+
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
+ }
}
+/**
+ * unlink_anon_vmas() - remove all links between a VMA and anon_vma's, freeing
+ * anon_vma_chain objects.
+ * @vma: The VMA whose links to anon_vma objects is to be severed.
+ *
+ * As part of the process anon_vma_chain's are freed,
+ * anon_vma->num_children,num_active_vmas is updated as required and, if the
+ * relevant anon_vma references no further VMAs, its reference count is
+ * decremented.
+ */
void unlink_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
- struct anon_vma *root = NULL;
+ struct anon_vma *active_anon_vma = vma->anon_vma;
+
+ /* Always hold mmap lock, read-lock on unmap possibly. */
+ mmap_assert_locked(vma->vm_mm);
+
+ /* Unfaulted is a no-op. */
+ if (!active_anon_vma) {
+ VM_WARN_ON_ONCE(!list_empty(&vma->anon_vma_chain));
+ return;
+ }
+
+ anon_vma_lock_write(active_anon_vma);
/*
* Unlink each anon_vma chained to the VMA. This list is ordered
@@ -404,7 +492,6 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
- root = lock_anon_vma_root(root, anon_vma);
anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
/*
@@ -419,16 +506,15 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
- if (vma->anon_vma) {
- vma->anon_vma->num_active_vmas--;
- /*
- * vma would still be needed after unlink, and anon_vma will be prepared
- * when handle fault.
- */
- vma->anon_vma = NULL;
- }
- unlock_anon_vma_root(root);
+ active_anon_vma->num_active_vmas--;
+ /*
+ * vma would still be needed after unlink, and anon_vma will be prepared
+ * when handle fault.
+ */
+ vma->anon_vma = NULL;
+ anon_vma_unlock_write(active_anon_vma);
+
/*
* Iterate the list once more, it now only contains empty and unlinked
@@ -2147,7 +2233,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
goto discard;
}
- if (swap_duplicate(entry) < 0) {
+ if (folio_dup_swap(folio, subpage) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
@@ -2158,7 +2244,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* so we'll not check/care.
*/
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
- swap_free(entry);
+ folio_put_swap(folio, subpage);
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
@@ -2166,7 +2252,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* See folio_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
folio_try_share_anon_rmap_pte(folio, subpage)) {
- swap_free(entry);
+ folio_put_swap(folio, subpage);
set_pte_at(mm, address, pvmw.pte, pteval);
goto walk_abort;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 063b4c3e4ccb..c40d786a21c6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Resizable virtual memory filesystem for Linux.
*
@@ -17,8 +18,6 @@
*
* tiny-shmem:
* Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
- *
- * This file is released under the GPL.
*/
#include <linux/fs.h>
@@ -983,7 +982,7 @@ static long shmem_free_swap(struct address_space *mapping,
xas_unlock_irq(&xas);
if (nr_pages)
- free_swap_and_cache_nr(radix_to_swp_entry(radswap), nr_pages);
+ swap_put_entries_direct(radix_to_swp_entry(radswap), nr_pages);
return nr_pages;
}
@@ -1622,11 +1621,23 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
}
if (split) {
+ int order;
+
try_split:
+ order = folio_order(folio);
/* Ensure the subpages are still dirty */
folio_test_set_dirty(folio);
if (split_folio_to_list(folio, folio_list))
goto redirty;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (order >= HPAGE_PMD_ORDER) {
+ count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1);
+ count_vm_event(THP_SWPOUT_FALLBACK);
+ }
+#endif
+ count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
+
folio_clear_dirty(folio);
}
@@ -1684,7 +1695,7 @@ try_split:
spin_unlock(&shmem_swaplist_lock);
}
- swap_shmem_alloc(folio->swap, nr_pages);
+ folio_dup_swap(folio, NULL);
shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
BUG_ON(folio_mapped(folio));
@@ -1705,7 +1716,7 @@ try_split:
/* Swap entry might be erased by racing shmem_free_swap() */
if (!error) {
shmem_recalc_inode(inode, 0, -nr_pages);
- swap_free_nr(folio->swap, nr_pages);
+ folio_put_swap(folio, NULL);
}
/*
@@ -2031,10 +2042,9 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
swp_entry_t entry, int order, gfp_t gfp)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ struct folio *new, *swapcache;
int nr_pages = 1 << order;
- struct folio *new;
gfp_t alloc_gfp;
- void *shadow;
/*
* We have arrived here because our zones are constrained, so don't
@@ -2074,34 +2084,19 @@ retry:
goto fallback;
}
- /*
- * Prevent parallel swapin from proceeding with the swap cache flag.
- *
- * Of course there is another possible concurrent scenario as well,
- * that is to say, the swap cache flag of a large folio has already
- * been set by swapcache_prepare(), while another thread may have
- * already split the large swap entry stored in the shmem mapping.
- * In this case, shmem_add_to_page_cache() will help identify the
- * concurrent swapin and return -EEXIST.
- */
- if (swapcache_prepare(entry, nr_pages)) {
+ swapcache = swapin_folio(entry, new);
+ if (swapcache != new) {
folio_put(new);
- new = ERR_PTR(-EEXIST);
- /* Try smaller folio to avoid cache conflict */
- goto fallback;
+ if (!swapcache) {
+ /*
+ * The new folio is charged already, swapin can
+ * only fail due to another raced swapin.
+ */
+ new = ERR_PTR(-EEXIST);
+ goto fallback;
+ }
}
-
- __folio_set_locked(new);
- __folio_set_swapbacked(new);
- new->swap = entry;
-
- memcg1_swapin(entry, nr_pages);
- shadow = swap_cache_get_shadow(entry);
- if (shadow)
- workingset_refault(new, shadow);
- folio_add_lru(new);
- swap_read_folio(new, NULL);
- return new;
+ return swapcache;
fallback:
/* Order 0 swapin failed, nothing to fallback to, abort */
if (!order)
@@ -2191,8 +2186,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
}
static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
- struct folio *folio, swp_entry_t swap,
- bool skip_swapcache)
+ struct folio *folio, swp_entry_t swap)
{
struct address_space *mapping = inode->i_mapping;
swp_entry_t swapin_error;
@@ -2208,15 +2202,14 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
nr_pages = folio_nr_pages(folio);
folio_wait_writeback(folio);
- if (!skip_swapcache)
- swap_cache_del_folio(folio);
+ folio_put_swap(folio, NULL);
+ swap_cache_del_folio(folio);
/*
* Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
* won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
* in shmem_evict_inode().
*/
shmem_recalc_inode(inode, -nr_pages, -nr_pages);
- swap_free_nr(swap, nr_pages);
}
static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
@@ -2309,7 +2302,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
softleaf_t index_entry;
struct swap_info_struct *si;
struct folio *folio = NULL;
- bool skip_swapcache = false;
int error, nr_pages, order;
pgoff_t offset;
@@ -2352,7 +2344,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
folio = NULL;
goto failed;
}
- skip_swapcache = true;
} else {
/* Cached swapin only supports order 0 folio */
folio = shmem_swapin_cluster(swap, gfp, info, index);
@@ -2408,9 +2399,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
* and swap cache folios are never partially freed.
*/
folio_lock(folio);
- if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
- shmem_confirm_swap(mapping, index, swap) < 0 ||
- folio->swap.val != swap.val) {
+ if (!folio_matches_swap_entry(folio, swap) ||
+ shmem_confirm_swap(mapping, index, swap) < 0) {
error = -EEXIST;
goto unlock;
}
@@ -2442,14 +2432,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
- if (skip_swapcache) {
- folio->swap.val = 0;
- swapcache_clear(si, swap, nr_pages);
- } else {
- swap_cache_del_folio(folio);
- }
+ folio_put_swap(folio, NULL);
+ swap_cache_del_folio(folio);
folio_mark_dirty(folio);
- swap_free_nr(swap, nr_pages);
put_swap_device(si);
*foliop = folio;
@@ -2458,14 +2443,11 @@ failed:
if (shmem_confirm_swap(mapping, index, swap) < 0)
error = -EEXIST;
if (error == -EIO)
- shmem_set_folio_swapin_error(inode, index, folio, swap,
- skip_swapcache);
+ shmem_set_folio_swapin_error(inode, index, folio, swap);
unlock:
if (folio)
folio_unlock(folio);
failed_nolock:
- if (skip_swapcache)
- swapcache_clear(si, folio->swap, folio_nr_pages(folio));
if (folio)
folio_put(folio);
put_swap_device(si);
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 3a4b5207635d..24078ac3e6bc 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -278,8 +278,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
- str_yes_no(atomic_read(&pgdat->kswapd_failures) >=
- MAX_RECLAIM_RETRIES),
+ str_yes_no(kswapd_test_hopeless(pgdat)),
K(node_page_state(pgdat, NR_BALLOON_PAGES)));
}
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 20eaee3e97f7..affa64437302 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -70,7 +70,7 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
memcg_aware ? memcg : NULL,
count_per_node);
if (total) {
- seq_printf(m, "%lu", mem_cgroup_ino(memcg));
+ seq_printf(m, "%llu", mem_cgroup_id(memcg));
for_each_node(nid)
seq_printf(m, " %lu", count_per_node[nid]);
seq_putc(m, '\n');
@@ -106,7 +106,8 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
size_t size, loff_t *pos)
{
struct shrinker *shrinker = file->private_data;
- unsigned long nr_to_scan = 0, ino, read_len;
+ unsigned long nr_to_scan = 0, read_len;
+ u64 id;
struct shrink_control sc = {
.gfp_mask = GFP_KERNEL,
};
@@ -119,7 +120,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
return -EFAULT;
kbuf[read_len] = '\0';
- if (sscanf(kbuf, "%lu %d %lu", &ino, &nid, &nr_to_scan) != 3)
+ if (sscanf(kbuf, "%llu %d %lu", &id, &nid, &nr_to_scan) != 3)
return -EINVAL;
if (nid < 0 || nid >= nr_node_ids)
@@ -129,15 +130,15 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
return size;
if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
- memcg = mem_cgroup_get_from_ino(ino);
- if (!memcg || IS_ERR(memcg))
+ memcg = mem_cgroup_get_from_id(id);
+ if (!memcg)
return -ENOENT;
if (!mem_cgroup_online(memcg)) {
mem_cgroup_put(memcg);
return -ENOENT;
}
- } else if (ino != 0) {
+ } else if (id != 0) {
return -EINVAL;
}
diff --git a/mm/slub.c b/mm/slub.c
index 18899017512c..42df791279d9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -8334,7 +8334,8 @@ void __init kmem_cache_init(void)
void __init kmem_cache_init_late(void)
{
- flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
+ flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM | WQ_PERCPU,
+ 0);
WARN_ON(!flushwq);
}
diff --git a/mm/swap.c b/mm/swap.c
index 2260dcd2775e..bb19ccbece46 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -513,7 +513,7 @@ void folio_add_lru(struct folio *folio)
EXPORT_SYMBOL(folio_add_lru);
/**
- * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA.
+ * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA.
* @folio: The folio to be added to the LRU.
* @vma: VMA in which the folio is mapped.
*
diff --git a/mm/swap.h b/mm/swap.h
index 1bd466da3039..bfafa637c458 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -183,6 +183,33 @@ static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci)
spin_unlock_irq(&ci->lock);
}
+/*
+ * Below are the core routines for doing swap for a folio.
+ * All helpers requires the folio to be locked, and a locked folio
+ * in the swap cache pins the swap entries / slots allocated to the
+ * folio, swap relies heavily on the swap cache and folio lock for
+ * synchronization.
+ *
+ * folio_alloc_swap(): the entry point for a folio to be swapped
+ * out. It allocates swap slots and pins the slots with swap cache.
+ * The slots start with a swap count of zero.
+ *
+ * folio_dup_swap(): increases the swap count of a folio, usually
+ * during it gets unmapped and a swap entry is installed to replace
+ * it (e.g., swap entry in page table). A swap slot with swap
+ * count == 0 should only be increasd by this helper.
+ *
+ * folio_put_swap(): does the opposite thing of folio_dup_swap().
+ */
+int folio_alloc_swap(struct folio *folio);
+int folio_dup_swap(struct folio *folio, struct page *subpage);
+void folio_put_swap(struct folio *folio, struct page *subpage);
+
+/* For internal use */
+extern void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long offset, unsigned int nr_pages);
+
/* linux/mm/page_io.c */
int sio_pool_init(void);
struct swap_iocb;
@@ -236,7 +263,7 @@ static inline bool folio_matches_swap_entry(const struct folio *folio,
/*
* All swap cache helpers below require the caller to ensure the swap entries
- * used are valid and stablize the device by any of the following ways:
+ * used are valid and stabilize the device by any of the following ways:
* - Hold a reference by get_swap_device(): this ensures a single entry is
* valid and increases the swap device's refcount.
* - Locking a folio in the swap cache: this ensures the folio's swap entries
@@ -245,11 +272,16 @@ static inline bool folio_matches_swap_entry(const struct folio *folio,
* swap entries in the page table, similar to locking swap cache folio.
* - See the comment of get_swap_device() for more complex usage.
*/
+bool swap_cache_has_folio(swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry);
void *swap_cache_get_shadow(swp_entry_t entry);
-void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow);
void swap_cache_del_folio(struct folio *folio);
+struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags,
+ struct mempolicy *mpol, pgoff_t ilx,
+ bool *alloced);
/* Below helpers require the caller to lock and pass in the swap cluster. */
+void __swap_cache_add_folio(struct swap_cluster_info *ci,
+ struct folio *folio, swp_entry_t entry);
void __swap_cache_del_folio(struct swap_cluster_info *ci,
struct folio *folio, swp_entry_t entry, void *shadow);
void __swap_cache_replace_folio(struct swap_cluster_info *ci,
@@ -261,13 +293,11 @@ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr);
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
struct swap_iocb **plug);
-struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags,
- struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
- bool skip_if_exists);
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct mempolicy *mpol, pgoff_t ilx);
struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
struct vm_fault *vmf);
+struct folio *swapin_folio(swp_entry_t entry, struct folio *folio);
void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr);
@@ -303,8 +333,6 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
{
- struct swap_info_struct *si = __swap_entry_to_info(entry);
- pgoff_t offset = swp_offset(entry);
int i;
/*
@@ -313,8 +341,9 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
* be in conflict with the folio in swap cache.
*/
for (i = 0; i < max_nr; i++) {
- if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+ if (swap_cache_has_folio(entry))
return i;
+ entry.val++;
}
return i;
@@ -353,9 +382,24 @@ static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry)
return NULL;
}
+static inline int folio_alloc_swap(struct folio *folio)
+{
+ return -EINVAL;
+}
+
+static inline int folio_dup_swap(struct folio *folio, struct page *page)
+{
+ return -EINVAL;
+}
+
+static inline void folio_put_swap(struct folio *folio, struct page *page)
+{
+}
+
static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
{
}
+
static inline void swap_write_unplug(struct swap_iocb *sio)
{
}
@@ -386,6 +430,11 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
return NULL;
}
+static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
+{
+ return NULL;
+}
+
static inline void swap_update_readahead(struct folio *folio,
struct vm_area_struct *vma, unsigned long addr)
{
@@ -397,8 +446,9 @@ static inline int swap_writeout(struct folio *folio,
return 0;
}
-static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
+static inline bool swap_cache_has_folio(swp_entry_t entry)
{
+ return false;
}
static inline struct folio *swap_cache_get_folio(swp_entry_t entry)
@@ -411,10 +461,6 @@ static inline void *swap_cache_get_shadow(swp_entry_t entry)
return NULL;
}
-static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow)
-{
-}
-
static inline void swap_cache_del_folio(struct folio *folio)
{
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 44d228982521..6d0eef7470be 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -81,7 +81,7 @@ void show_swap_cache_info(void)
* Context: Caller must ensure @entry is valid and protect the swap device
* with reference count or locks.
* Return: Returns the found folio on success, NULL otherwise. The caller
- * must lock nd check if the folio still matches the swap entry before
+ * must lock and check if the folio still matches the swap entry before
* use (e.g., folio_matches_swap_entry).
*/
struct folio *swap_cache_get_folio(swp_entry_t entry)
@@ -103,6 +103,22 @@ struct folio *swap_cache_get_folio(swp_entry_t entry)
}
/**
+ * swap_cache_has_folio - Check if a swap slot has cache.
+ * @entry: swap entry indicating the slot.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap
+ * device with reference count or locks.
+ */
+bool swap_cache_has_folio(swp_entry_t entry)
+{
+ unsigned long swp_tb;
+
+ swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+ swp_cluster_offset(entry));
+ return swp_tb_is_folio(swp_tb);
+}
+
+/**
* swap_cache_get_shadow - Looks up a shadow in the swap cache.
* @entry: swap entry used for the lookup.
*
@@ -121,6 +137,34 @@ void *swap_cache_get_shadow(swp_entry_t entry)
return NULL;
}
+void __swap_cache_add_folio(struct swap_cluster_info *ci,
+ struct folio *folio, swp_entry_t entry)
+{
+ unsigned long new_tb;
+ unsigned int ci_start, ci_off, ci_end;
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
+
+ new_tb = folio_to_swp_tb(folio);
+ ci_start = swp_cluster_offset(entry);
+ ci_off = ci_start;
+ ci_end = ci_start + nr_pages;
+ do {
+ VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off)));
+ __swap_table_set(ci, ci_off, new_tb);
+ } while (++ci_off < ci_end);
+
+ folio_ref_add(folio, nr_pages);
+ folio_set_swapcache(folio);
+ folio->swap = entry;
+
+ node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
+}
+
/**
* swap_cache_add_folio - Add a folio into the swap cache.
* @folio: The folio to be added.
@@ -130,43 +174,51 @@ void *swap_cache_get_shadow(swp_entry_t entry)
*
* Context: Caller must ensure @entry is valid and protect the swap device
* with reference count or locks.
- * The caller also needs to update the corresponding swap_map slots with
- * SWAP_HAS_CACHE bit to avoid race or conflict.
*/
-void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp)
+static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
+ void **shadowp)
{
+ int err;
void *shadow = NULL;
- unsigned long old_tb, new_tb;
+ unsigned long old_tb;
+ struct swap_info_struct *si;
struct swap_cluster_info *ci;
- unsigned int ci_start, ci_off, ci_end;
+ unsigned int ci_start, ci_off, ci_end, offset;
unsigned long nr_pages = folio_nr_pages(folio);
- VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
- VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
- VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
-
- new_tb = folio_to_swp_tb(folio);
+ si = __swap_entry_to_info(entry);
ci_start = swp_cluster_offset(entry);
ci_end = ci_start + nr_pages;
ci_off = ci_start;
- ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+ offset = swp_offset(entry);
+ ci = swap_cluster_lock(si, swp_offset(entry));
+ if (unlikely(!ci->table)) {
+ err = -ENOENT;
+ goto failed;
+ }
do {
- old_tb = __swap_table_xchg(ci, ci_off, new_tb);
- WARN_ON_ONCE(swp_tb_is_folio(old_tb));
+ old_tb = __swap_table_get(ci, ci_off);
+ if (unlikely(swp_tb_is_folio(old_tb))) {
+ err = -EEXIST;
+ goto failed;
+ }
+ if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) {
+ err = -ENOENT;
+ goto failed;
+ }
if (swp_tb_is_shadow(old_tb))
shadow = swp_tb_to_shadow(old_tb);
+ offset++;
} while (++ci_off < ci_end);
-
- folio_ref_add(folio, nr_pages);
- folio_set_swapcache(folio);
- folio->swap = entry;
+ __swap_cache_add_folio(ci, folio, entry);
swap_cluster_unlock(ci);
-
- node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
- lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
-
if (shadowp)
*shadowp = shadow;
+ return 0;
+
+failed:
+ swap_cluster_unlock(ci);
+ return err;
}
/**
@@ -185,8 +237,10 @@ void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp
void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
swp_entry_t entry, void *shadow)
{
+ struct swap_info_struct *si;
unsigned long old_tb, new_tb;
unsigned int ci_start, ci_off, ci_end;
+ bool folio_swapped = false, need_free = false;
unsigned long nr_pages = folio_nr_pages(folio);
VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci);
@@ -194,6 +248,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);
+ si = __swap_entry_to_info(entry);
new_tb = shadow_swp_to_tb(shadow);
ci_start = swp_cluster_offset(entry);
ci_end = ci_start + nr_pages;
@@ -203,12 +258,27 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
old_tb = __swap_table_xchg(ci, ci_off, new_tb);
WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
swp_tb_to_folio(old_tb) != folio);
+ if (__swap_count(swp_entry(si->type,
+ swp_offset(entry) + ci_off - ci_start)))
+ folio_swapped = true;
+ else
+ need_free = true;
} while (++ci_off < ci_end);
folio->swap.val = 0;
folio_clear_swapcache(folio);
node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
+
+ if (!folio_swapped) {
+ swap_entries_free(si, ci, swp_offset(entry), nr_pages);
+ } else if (need_free) {
+ do {
+ if (!__swap_count(entry))
+ swap_entries_free(si, ci, swp_offset(entry), 1);
+ entry.val++;
+ } while (--nr_pages);
+ }
}
/**
@@ -230,7 +300,6 @@ void swap_cache_del_folio(struct folio *folio)
__swap_cache_del_folio(ci, folio, entry, NULL);
swap_cluster_unlock(ci);
- put_swap_folio(folio, entry);
folio_ref_sub(folio, folio_nr_pages(folio));
}
@@ -283,7 +352,7 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci,
}
/**
- * swap_cache_clear_shadow - Clears a set of shadows in the swap cache.
+ * __swap_cache_clear_shadow - Clears a set of shadows in the swap cache.
* @entry: The starting index entry.
* @nr_ents: How many slots need to be cleared.
*
@@ -401,108 +470,143 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
}
}
-struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
- bool skip_if_exists)
+/**
+ * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache.
+ * @entry: swap entry to be bound to the folio.
+ * @folio: folio to be added.
+ * @gfp: memory allocation flags for charge, can be 0 if @charged if true.
+ * @charged: if the folio is already charged.
+ *
+ * Update the swap_map and add folio as swap cache, typically before swapin.
+ * All swap slots covered by the folio must have a non-zero swap count.
+ *
+ * Context: Caller must protect the swap device with reference count or locks.
+ * Return: Returns the folio being added on success. Returns the existing folio
+ * if @entry is already cached. Returns NULL if raced with swapin or swapoff.
+ */
+static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
+ struct folio *folio,
+ gfp_t gfp, bool charged)
{
- struct swap_info_struct *si = __swap_entry_to_info(entry);
- struct folio *folio;
- struct folio *new_folio = NULL;
- struct folio *result = NULL;
- void *shadow = NULL;
+ struct folio *swapcache = NULL;
+ void *shadow;
+ int ret;
- *new_page_allocated = false;
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
for (;;) {
- int err;
+ ret = swap_cache_add_folio(folio, entry, &shadow);
+ if (!ret)
+ break;
/*
- * Check the swap cache first, if a cached folio is found,
- * return it unlocked. The caller will lock and check it.
+ * Large order allocation needs special handling on
+ * race: if a smaller folio exists in cache, swapin needs
+ * to fallback to order 0, and doing a swap cache lookup
+ * might return a folio that is irrelevant to the faulting
+ * entry because @entry is aligned down. Just return NULL.
*/
- folio = swap_cache_get_folio(entry);
- if (folio)
- goto got_folio;
+ if (ret != -EEXIST || folio_test_large(folio))
+ goto failed;
- /*
- * Just skip read ahead for unused swap slot.
- */
- if (!swap_entry_swapped(si, entry))
- goto put_and_return;
+ swapcache = swap_cache_get_folio(entry);
+ if (swapcache)
+ goto failed;
+ }
- /*
- * Get a new folio to read into from swap. Allocate it now if
- * new_folio not exist, before marking swap_map SWAP_HAS_CACHE,
- * when -EEXIST will cause any racers to loop around until we
- * add it to cache.
- */
- if (!new_folio) {
- new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
- if (!new_folio)
- goto put_and_return;
- }
+ if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) {
+ swap_cache_del_folio(folio);
+ goto failed;
+ }
- /*
- * Swap entry may have been freed since our caller observed it.
- */
- err = swapcache_prepare(entry, 1);
- if (!err)
- break;
- else if (err != -EEXIST)
- goto put_and_return;
+ memcg1_swapin(entry, folio_nr_pages(folio));
+ if (shadow)
+ workingset_refault(folio, shadow);
- /*
- * Protect against a recursive call to __read_swap_cache_async()
- * on the same entry waiting forever here because SWAP_HAS_CACHE
- * is set but the folio is not the swap cache yet. This can
- * happen today if mem_cgroup_swapin_charge_folio() below
- * triggers reclaim through zswap, which may call
- * __read_swap_cache_async() in the writeback path.
- */
- if (skip_if_exists)
- goto put_and_return;
+ /* Caller will initiate read into locked folio */
+ folio_add_lru(folio);
+ return folio;
- /*
- * We might race against __swap_cache_del_folio(), and
- * stumble across a swap_map entry whose SWAP_HAS_CACHE
- * has not yet been cleared. Or race against another
- * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
- * in swap_map, but not yet added its folio to swap cache.
- */
- schedule_timeout_uninterruptible(1);
- }
+failed:
+ folio_unlock(folio);
+ return swapcache;
+}
- /*
- * The swap entry is ours to swap in. Prepare the new folio.
- */
- __folio_set_locked(new_folio);
- __folio_set_swapbacked(new_folio);
+/**
+ * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache.
+ * @entry: the swapped out swap entry to be binded to the folio.
+ * @gfp_mask: memory allocation flags
+ * @mpol: NUMA memory allocation policy to be applied
+ * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
+ * @new_page_allocated: sets true if allocation happened, false otherwise
+ *
+ * Allocate a folio in the swap cache for one swap slot, typically before
+ * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by
+ * @entry must have a non-zero swap count (swapped out).
+ * Currently only supports order 0.
+ *
+ * Context: Caller must protect the swap device with reference count or locks.
+ * Return: Returns the existing folio if @entry is cached already. Returns
+ * NULL if failed due to -ENOMEM or @entry have a swap count < 1.
+ */
+struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
+ struct mempolicy *mpol, pgoff_t ilx,
+ bool *new_page_allocated)
+{
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
+ struct folio *folio;
+ struct folio *result = NULL;
- if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry))
- goto fail_unlock;
+ *new_page_allocated = false;
+ /* Check the swap cache again for readahead path. */
+ folio = swap_cache_get_folio(entry);
+ if (folio)
+ return folio;
- swap_cache_add_folio(new_folio, entry, &shadow);
- memcg1_swapin(entry, 1);
+ /* Skip allocation for unused and bad swap slot for readahead. */
+ if (!swap_entry_swapped(si, entry))
+ return NULL;
- if (shadow)
- workingset_refault(new_folio, shadow);
-
- /* Caller will initiate read into locked new_folio */
- folio_add_lru(new_folio);
- *new_page_allocated = true;
- folio = new_folio;
-got_folio:
- result = folio;
- goto put_and_return;
-
-fail_unlock:
- put_swap_folio(new_folio, entry);
- folio_unlock(new_folio);
-put_and_return:
- if (!(*new_page_allocated) && new_folio)
- folio_put(new_folio);
+ /* Allocate a new folio to be added into the swap cache. */
+ folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
+ if (!folio)
+ return NULL;
+ /* Try add the new folio, returns existing folio or NULL on failure. */
+ result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false);
+ if (result == folio)
+ *new_page_allocated = true;
+ else
+ folio_put(folio);
return result;
}
+/**
+ * swapin_folio - swap-in one or multiple entries skipping readahead.
+ * @entry: starting swap entry to swap in
+ * @folio: a new allocated and charged folio
+ *
+ * Reads @entry into @folio, @folio will be added to the swap cache.
+ * If @folio is a large folio, the @entry will be rounded down to align
+ * with the folio size.
+ *
+ * Return: returns pointer to @folio on success. If folio is a large folio
+ * and this raced with another swapin, NULL will be returned to allow fallback
+ * to order 0. Else, if another folio was already added to the swap cache,
+ * return that swap cache folio instead.
+ */
+struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
+{
+ struct folio *swapcache;
+ pgoff_t offset = swp_offset(entry);
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
+ swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
+ if (swapcache == folio)
+ swap_read_folio(folio, NULL);
+ return swapcache;
+}
+
/*
* Locate a page of swap in physical memory, reserving swap cache space
* and reading the disk if it is not already cached.
@@ -524,8 +628,8 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return NULL;
mpol = get_vma_policy(vma, addr, 0, &ilx);
- folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
+ &page_allocated);
mpol_cond_put(mpol);
if (page_allocated)
@@ -642,9 +746,9 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
blk_start_plug(&plug);
for (offset = start_offset; offset <= end_offset ; offset++) {
/* Ok, do the async read-ahead now */
- folio = __read_swap_cache_async(
- swp_entry(swp_type(entry), offset),
- gfp_mask, mpol, ilx, &page_allocated, false);
+ folio = swap_cache_alloc_folio(
+ swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx,
+ &page_allocated);
if (!folio)
continue;
if (page_allocated) {
@@ -661,8 +765,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
/* The page was likely read above, so no need for plugging here */
- folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
+ &page_allocated);
if (unlikely(page_allocated))
swap_read_folio(folio, NULL);
return folio;
@@ -766,8 +870,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
if (!si)
continue;
}
- folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
- &page_allocated, false);
+ folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
+ &page_allocated);
if (si)
put_swap_device(si);
if (!folio)
@@ -788,8 +892,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
lru_add_drain();
skip:
/* The folio was likely read above, so no need for plugging here */
- folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
- &page_allocated, false);
+ folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx,
+ &page_allocated);
if (unlikely(page_allocated))
swap_read_folio(folio, NULL);
return folio;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 25120cf7c480..c2377c4b6bb9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -48,16 +48,18 @@
#include <linux/swap_cgroup.h>
#include "swap_table.h"
#include "internal.h"
+#include "swap_table.h"
#include "swap.h"
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-static void swap_entries_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry, unsigned int nr_pages);
static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr);
+static void swap_put_entry_locked(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long offset);
static bool folio_swapcache_freeable(struct folio *folio);
static void move_cluster(struct swap_info_struct *si,
struct swap_cluster_info *ci, struct list_head *list,
@@ -81,9 +83,7 @@ bool swap_migration_ad_supported;
#endif /* CONFIG_MIGRATION */
static const char Bad_file[] = "Bad swap file entry ";
-static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
-static const char Unused_offset[] = "Unused swap offset entry ";
/*
* all active swap_info_structs
@@ -144,11 +144,6 @@ static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry)
return swap_type_to_info(swp_type(entry));
}
-static inline unsigned char swap_count(unsigned char ent)
-{
- return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
-}
-
/*
* Use the second highest bit of inuse_pages counter as the indicator
* if one swap device is on the available plist, so the atomic can
@@ -180,39 +175,25 @@ static long swap_usage_in_pages(struct swap_info_struct *si)
#define TTRS_FULL 0x4
static bool swap_only_has_cache(struct swap_info_struct *si,
- unsigned long offset, int nr_pages)
+ struct swap_cluster_info *ci,
+ unsigned long offset, int nr_pages)
{
+ unsigned int ci_off = offset % SWAPFILE_CLUSTER;
unsigned char *map = si->swap_map + offset;
unsigned char *map_end = map + nr_pages;
+ unsigned long swp_tb;
do {
- VM_BUG_ON(!(*map & SWAP_HAS_CACHE));
- if (*map != SWAP_HAS_CACHE)
+ swp_tb = __swap_table_get(ci, ci_off);
+ VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb));
+ if (*map)
return false;
+ ++ci_off;
} while (++map < map_end);
return true;
}
-static bool swap_is_last_map(struct swap_info_struct *si,
- unsigned long offset, int nr_pages, bool *has_cache)
-{
- unsigned char *map = si->swap_map + offset;
- unsigned char *map_end = map + nr_pages;
- unsigned char count = *map;
-
- if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM)
- return false;
-
- while (++map < map_end) {
- if (*map != count)
- return false;
- }
-
- *has_cache = !!(count & SWAP_HAS_CACHE);
- return true;
-}
-
/*
* returns number of pages in the folio that backs the swap entry. If positive,
* the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
@@ -262,12 +243,12 @@ again:
goto out_unlock;
/*
- * It's safe to delete the folio from swap cache only if the folio's
- * swap_map is HAS_CACHE only, which means the slots have no page table
+ * It's safe to delete the folio from swap cache only if the folio
+ * is in swap cache with swap count == 0. The slots have no page table
* reference or pending writeback, and can't be allocated to others.
*/
ci = swap_cluster_lock(si, offset);
- need_reclaim = swap_only_has_cache(si, offset, nr_pages);
+ need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages);
swap_cluster_unlock(ci);
if (!need_reclaim)
goto out_unlock;
@@ -777,68 +758,84 @@ static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info,
return 0;
}
+/*
+ * Reclaim drops the ci lock, so the cluster may become unusable (freed or
+ * stolen by a lower order). @usable will be set to false if that happens.
+ */
static bool cluster_reclaim_range(struct swap_info_struct *si,
struct swap_cluster_info *ci,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned int order,
+ bool *usable)
{
+ unsigned int nr_pages = 1 << order;
+ unsigned long offset = start, end = start + nr_pages;
unsigned char *map = si->swap_map;
- unsigned long offset = start;
- int nr_reclaim;
+ unsigned long swp_tb;
spin_unlock(&ci->lock);
do {
- switch (READ_ONCE(map[offset])) {
- case 0:
- offset++;
- break;
- case SWAP_HAS_CACHE:
- nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
- if (nr_reclaim > 0)
- offset += nr_reclaim;
- else
- goto out;
+ if (READ_ONCE(map[offset]))
break;
- default:
- goto out;
+ swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER);
+ if (swp_tb_is_folio(swp_tb)) {
+ if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0)
+ break;
}
- } while (offset < end);
-out:
+ } while (++offset < end);
spin_lock(&ci->lock);
+
+ /*
+ * We just dropped ci->lock so cluster could be used by another
+ * order or got freed, check if it's still usable or empty.
+ */
+ if (!cluster_is_usable(ci, order)) {
+ *usable = false;
+ return false;
+ }
+ *usable = true;
+
+ /* Fast path, no need to scan if the whole cluster is empty */
+ if (cluster_is_empty(ci))
+ return true;
+
/*
* Recheck the range no matter reclaim succeeded or not, the slot
* could have been be freed while we are not holding the lock.
*/
- for (offset = start; offset < end; offset++)
- if (READ_ONCE(map[offset]))
+ for (offset = start; offset < end; offset++) {
+ swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
+ if (map[offset] || !swp_tb_is_null(swp_tb))
return false;
+ }
return true;
}
static bool cluster_scan_range(struct swap_info_struct *si,
struct swap_cluster_info *ci,
- unsigned long start, unsigned int nr_pages,
+ unsigned long offset, unsigned int nr_pages,
bool *need_reclaim)
{
- unsigned long offset, end = start + nr_pages;
+ unsigned long end = offset + nr_pages;
unsigned char *map = si->swap_map;
+ unsigned long swp_tb;
if (cluster_is_empty(ci))
return true;
- for (offset = start; offset < end; offset++) {
- switch (READ_ONCE(map[offset])) {
- case 0:
- continue;
- case SWAP_HAS_CACHE:
+ do {
+ if (map[offset])
+ return false;
+ swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
+ if (swp_tb_is_folio(swp_tb)) {
if (!vm_swap_full())
return false;
*need_reclaim = true;
- continue;
- default:
- return false;
+ } else {
+ /* A entry with no count and no cache must be null */
+ VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
}
- }
+ } while (++offset < end);
return true;
}
@@ -863,11 +860,13 @@ static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci,
}
}
-static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
- unsigned int start, unsigned char usage,
- unsigned int order)
+static bool cluster_alloc_range(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ struct folio *folio,
+ unsigned int offset)
{
- unsigned int nr_pages = 1 << order;
+ unsigned long nr_pages;
+ unsigned int order;
lockdep_assert_held(&ci->lock);
@@ -875,16 +874,38 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
return false;
/*
+ * All mm swap allocation starts with a folio (folio_alloc_swap),
+ * it's also the only allocation path for large orders allocation.
+ * Such swap slots starts with count == 0 and will be increased
+ * upon folio unmap.
+ *
+ * Else, it's a exclusive order 0 allocation for hibernation.
+ * The slot starts with count == 1 and never increases.
+ */
+ if (likely(folio)) {
+ order = folio_order(folio);
+ nr_pages = 1 << order;
+ __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset));
+ } else if (IS_ENABLED(CONFIG_HIBERNATION)) {
+ order = 0;
+ nr_pages = 1;
+ WARN_ON_ONCE(si->swap_map[offset]);
+ si->swap_map[offset] = 1;
+ swap_cluster_assert_table_empty(ci, offset, 1);
+ } else {
+ /* Allocation without folio is only possible with hibernation */
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /*
* The first allocation in a cluster makes the
* cluster exclusive to this order
*/
if (cluster_is_empty(ci))
ci->order = order;
-
- memset(si->swap_map + start, usage, nr_pages);
- swap_cluster_assert_table_empty(ci, start, nr_pages);
- swap_range_alloc(si, nr_pages);
ci->count += nr_pages;
+ swap_range_alloc(si, nr_pages);
return true;
}
@@ -892,17 +913,17 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
/* Try use a new cluster for current CPU and allocate from it. */
static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
struct swap_cluster_info *ci,
- unsigned long offset,
- unsigned int order,
- unsigned char usage)
+ struct folio *folio, unsigned long offset)
{
unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
+ unsigned int order = likely(folio) ? folio_order(folio) : 0;
unsigned int nr_pages = 1 << order;
- bool need_reclaim, ret;
+ bool need_reclaim, ret, usable;
lockdep_assert_held(&ci->lock);
+ VM_WARN_ON(!cluster_is_usable(ci, order));
if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER)
goto out;
@@ -912,14 +933,8 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim))
continue;
if (need_reclaim) {
- ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages);
- /*
- * Reclaim drops ci->lock and cluster could be used
- * by another order. Not checking flag as off-list
- * cluster has no flag set, and change of list
- * won't cause fragmentation.
- */
- if (!cluster_is_usable(ci, order))
+ ret = cluster_reclaim_range(si, ci, offset, order, &usable);
+ if (!usable)
goto out;
if (cluster_is_empty(ci))
offset = start;
@@ -927,7 +942,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
if (!ret)
continue;
}
- if (!cluster_alloc_range(si, ci, offset, usage, order))
+ if (!cluster_alloc_range(si, ci, folio, offset))
break;
found = offset;
offset += nr_pages;
@@ -949,8 +964,7 @@ out:
static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
struct list_head *list,
- unsigned int order,
- unsigned char usage,
+ struct folio *folio,
bool scan_all)
{
unsigned int found = SWAP_ENTRY_INVALID;
@@ -962,7 +976,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
if (!ci)
break;
offset = cluster_offset(si, ci);
- found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
+ found = alloc_swap_scan_cluster(si, ci, folio, offset);
if (found)
break;
} while (scan_all);
@@ -987,7 +1001,8 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
to_scan--;
while (offset < end) {
- if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) {
+ if (!READ_ONCE(map[offset]) &&
+ swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) {
spin_unlock(&ci->lock);
nr_reclaim = __try_to_reclaim_swap(si, offset,
TTRS_ANYWAY);
@@ -1023,10 +1038,11 @@ static void swap_reclaim_work(struct work_struct *work)
* Try to allocate swap entries with specified order and try set a new
* cluster for current CPU too.
*/
-static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
- unsigned char usage)
+static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
+ struct folio *folio)
{
struct swap_cluster_info *ci;
+ unsigned int order = likely(folio) ? folio_order(folio) : 0;
unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
/*
@@ -1048,8 +1064,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
offset = cluster_offset(si, ci);
- found = alloc_swap_scan_cluster(si, ci, offset,
- order, usage);
+ found = alloc_swap_scan_cluster(si, ci, folio, offset);
} else {
swap_cluster_unlock(ci);
}
@@ -1063,22 +1078,19 @@ new_cluster:
* to spread out the writes.
*/
if (si->flags & SWP_PAGE_DISCARD) {
- found = alloc_swap_scan_list(si, &si->free_clusters, order, usage,
- false);
+ found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
if (found)
goto done;
}
if (order < PMD_ORDER) {
- found = alloc_swap_scan_list(si, &si->nonfull_clusters[order],
- order, usage, true);
+ found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], folio, true);
if (found)
goto done;
}
if (!(si->flags & SWP_PAGE_DISCARD)) {
- found = alloc_swap_scan_list(si, &si->free_clusters, order, usage,
- false);
+ found = alloc_swap_scan_list(si, &si->free_clusters, folio, false);
if (found)
goto done;
}
@@ -1092,10 +1104,9 @@ new_cluster:
* Scan only one fragment cluster is good enough. Order 0
* allocation will surely success, and large allocation
* failure is not critical. Scanning one cluster still
- * keeps the list rotated and reclaimed (for HAS_CACHE).
+ * keeps the list rotated and reclaimed (for clean swap cache).
*/
- found = alloc_swap_scan_list(si, &si->frag_clusters[order], order,
- usage, false);
+ found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false);
if (found)
goto done;
}
@@ -1109,13 +1120,11 @@ new_cluster:
* Clusters here have at least one usable slots and can't fail order 0
* allocation, but reclaim may drop si->lock and race with another user.
*/
- found = alloc_swap_scan_list(si, &si->frag_clusters[o],
- 0, usage, true);
+ found = alloc_swap_scan_list(si, &si->frag_clusters[o], folio, true);
if (found)
goto done;
- found = alloc_swap_scan_list(si, &si->nonfull_clusters[o],
- 0, usage, true);
+ found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], folio, true);
if (found)
goto done;
}
@@ -1306,12 +1315,12 @@ static bool get_swap_device_info(struct swap_info_struct *si)
* Fast path try to get swap entries with specified order from current
* CPU's swap entry pool (a cluster).
*/
-static bool swap_alloc_fast(swp_entry_t *entry,
- int order)
+static bool swap_alloc_fast(struct folio *folio)
{
+ unsigned int order = folio_order(folio);
struct swap_cluster_info *ci;
struct swap_info_struct *si;
- unsigned int offset, found = SWAP_ENTRY_INVALID;
+ unsigned int offset;
/*
* Once allocated, swap_info_struct will never be completely freed,
@@ -1326,22 +1335,18 @@ static bool swap_alloc_fast(swp_entry_t *entry,
if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
offset = cluster_offset(si, ci);
- found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE);
- if (found)
- *entry = swp_entry(si->type, found);
+ alloc_swap_scan_cluster(si, ci, folio, offset);
} else {
swap_cluster_unlock(ci);
}
put_swap_device(si);
- return !!found;
+ return folio_test_swapcache(folio);
}
/* Rotate the device and switch to a new cluster */
-static void swap_alloc_slow(swp_entry_t *entry,
- int order)
+static void swap_alloc_slow(struct folio *folio)
{
- unsigned long offset;
struct swap_info_struct *si, *next;
spin_lock(&swap_avail_lock);
@@ -1351,13 +1356,11 @@ start_over:
plist_requeue(&si->avail_list, &swap_avail_head);
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
- offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
+ cluster_alloc_swap_entry(si, folio);
put_swap_device(si);
- if (offset) {
- *entry = swp_entry(si->type, offset);
+ if (folio_test_swapcache(folio))
return;
- }
- if (order)
+ if (folio_test_large(folio))
return;
}
@@ -1409,6 +1412,75 @@ start_over:
}
/**
+ * swap_put_entries_cluster - Decrease the swap count of a set of slots.
+ * @si: The swap device.
+ * @start: start offset of slots.
+ * @nr: number of slots.
+ * @reclaim_cache: if true, also reclaim the swap cache.
+ *
+ * This helper decreases the swap count of a set of slots and tries to
+ * batch free them. Also reclaims the swap cache if @reclaim_cache is true.
+ * Context: The caller must ensure that all slots belong to the same
+ * cluster and their swap count doesn't go underflow.
+ */
+static void swap_put_entries_cluster(struct swap_info_struct *si,
+ unsigned long start, int nr,
+ bool reclaim_cache)
+{
+ unsigned long offset = start, end = start + nr;
+ unsigned long batch_start = SWAP_ENTRY_INVALID;
+ struct swap_cluster_info *ci;
+ bool need_reclaim = false;
+ unsigned int nr_reclaimed;
+ unsigned long swp_tb;
+ unsigned int count;
+
+ ci = swap_cluster_lock(si, offset);
+ do {
+ swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER);
+ count = si->swap_map[offset];
+ VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD);
+ if (count == 1) {
+ /* count == 1 and non-cached slots will be batch freed. */
+ if (!swp_tb_is_folio(swp_tb)) {
+ if (!batch_start)
+ batch_start = offset;
+ continue;
+ }
+ /* count will be 0 after put, slot can be reclaimed */
+ need_reclaim = true;
+ }
+ /*
+ * A count != 1 or cached slot can't be freed. Put its swap
+ * count and then free the interrupted pending batch. Cached
+ * slots will be freed when folio is removed from swap cache
+ * (__swap_cache_del_folio).
+ */
+ swap_put_entry_locked(si, ci, offset);
+ if (batch_start) {
+ swap_entries_free(si, ci, batch_start, offset - batch_start);
+ batch_start = SWAP_ENTRY_INVALID;
+ }
+ } while (++offset < end);
+
+ if (batch_start)
+ swap_entries_free(si, ci, batch_start, offset - batch_start);
+ swap_cluster_unlock(ci);
+
+ if (!need_reclaim || !reclaim_cache)
+ return;
+
+ offset = start;
+ do {
+ nr_reclaimed = __try_to_reclaim_swap(si, offset,
+ TTRS_UNMAPPED | TTRS_FULL);
+ offset++;
+ if (nr_reclaimed)
+ offset = round_up(offset, abs(nr_reclaimed));
+ } while (offset < end);
+}
+
+/**
* folio_alloc_swap - allocate swap space for a folio
* @folio: folio we want to move to swap
*
@@ -1422,7 +1494,6 @@ int folio_alloc_swap(struct folio *folio)
{
unsigned int order = folio_order(folio);
unsigned int size = 1 << order;
- swp_entry_t entry = {};
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
@@ -1447,89 +1518,94 @@ int folio_alloc_swap(struct folio *folio)
again:
local_lock(&percpu_swap_cluster.lock);
- if (!swap_alloc_fast(&entry, order))
- swap_alloc_slow(&entry, order);
+ if (!swap_alloc_fast(folio))
+ swap_alloc_slow(folio);
local_unlock(&percpu_swap_cluster.lock);
- if (unlikely(!order && !entry.val)) {
+ if (!order && unlikely(!folio_test_swapcache(folio))) {
if (swap_sync_discard())
goto again;
}
/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
- if (mem_cgroup_try_charge_swap(folio, entry))
- goto out_free;
+ if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap)))
+ swap_cache_del_folio(folio);
- if (!entry.val)
+ if (unlikely(!folio_test_swapcache(folio)))
return -ENOMEM;
- swap_cache_add_folio(folio, entry, NULL);
-
return 0;
+}
+
+/**
+ * folio_dup_swap() - Increase swap count of swap entries of a folio.
+ * @folio: folio with swap entries bounded.
+ * @subpage: if not NULL, only increase the swap count of this subpage.
+ *
+ * Typically called when the folio is unmapped and have its swap entry to
+ * take its palce.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache.
+ * NOTE: The caller also has to ensure there is no raced call to
+ * swap_put_entries_direct on its swap entry before this helper returns, or
+ * the swap map may underflow. Currently, we only accept @subpage == NULL
+ * for shmem due to the limitation of swap continuation: shmem always
+ * duplicates the swap entry only once, so there is no such issue for it.
+ */
+int folio_dup_swap(struct folio *folio, struct page *subpage)
+{
+ int err = 0;
+ swp_entry_t entry = folio->swap;
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio);
+
+ if (subpage) {
+ entry.val += folio_page_idx(folio, subpage);
+ nr_pages = 1;
+ }
+
+ while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM)
+ err = add_swap_count_continuation(entry, GFP_ATOMIC);
-out_free:
- put_swap_folio(folio, entry);
- return -ENOMEM;
+ return err;
}
-static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
+/**
+ * folio_put_swap() - Decrease swap count of swap entries of a folio.
+ * @folio: folio with swap entries bounded, must be in swap cache and locked.
+ * @subpage: if not NULL, only decrease the swap count of this subpage.
+ *
+ * This won't free the swap slots even if swap count drops to zero, they are
+ * still pinned by the swap cache. User may call folio_free_swap to free them.
+ * Context: Caller must ensure the folio is locked and in the swap cache.
+ */
+void folio_put_swap(struct folio *folio, struct page *subpage)
{
- struct swap_info_struct *si;
- unsigned long offset;
+ swp_entry_t entry = folio->swap;
+ unsigned long nr_pages = folio_nr_pages(folio);
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
- if (!entry.val)
- goto out;
- si = swap_entry_to_info(entry);
- if (!si)
- goto bad_nofile;
- if (data_race(!(si->flags & SWP_USED)))
- goto bad_device;
- offset = swp_offset(entry);
- if (offset >= si->max)
- goto bad_offset;
- if (data_race(!si->swap_map[swp_offset(entry)]))
- goto bad_free;
- return si;
+ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio);
-bad_free:
- pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
- goto out;
-bad_offset:
- pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
- goto out;
-bad_device:
- pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val);
- goto out;
-bad_nofile:
- pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
-out:
- return NULL;
+ if (subpage) {
+ entry.val += folio_page_idx(folio, subpage);
+ nr_pages = 1;
+ }
+
+ swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false);
}
-static unsigned char swap_entry_put_locked(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry,
- unsigned char usage)
+static void swap_put_entry_locked(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long offset)
{
- unsigned long offset = swp_offset(entry);
unsigned char count;
- unsigned char has_cache;
count = si->swap_map[offset];
-
- has_cache = count & SWAP_HAS_CACHE;
- count &= ~SWAP_HAS_CACHE;
-
- if (usage == SWAP_HAS_CACHE) {
- VM_BUG_ON(!has_cache);
- has_cache = 0;
- } else if (count == SWAP_MAP_SHMEM) {
- /*
- * Or we could insist on shmem.c using a special
- * swap_shmem_free() and free_shmem_swap_and_cache()...
- */
- count = 0;
- } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+ if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
if (count == COUNT_CONTINUED) {
if (swap_count_continued(si, offset, count))
count = SWAP_MAP_MAX | COUNT_CONTINUED;
@@ -1539,13 +1615,9 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si,
count--;
}
- usage = count | has_cache;
- if (usage)
- WRITE_ONCE(si->swap_map[offset], usage);
- else
- swap_entries_free(si, ci, entry, 1);
-
- return usage;
+ WRITE_ONCE(si->swap_map[offset], count);
+ if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))
+ swap_entries_free(si, ci, offset, 1);
}
/*
@@ -1574,10 +1646,9 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si,
* CPU1 CPU2
* do_swap_page()
* ... swapoff+swapon
- * __read_swap_cache_async()
- * swapcache_prepare()
- * __swap_duplicate()
- * // check swap_map
+ * swap_cache_alloc_folio()
+ * swap_cache_add_folio()
+ * // check swap_map
* // verify PTE not changed
*
* In __swap_duplicate(), the swap_map need to be checked before
@@ -1614,105 +1685,15 @@ put_out:
return NULL;
}
-static void swap_entries_put_cache(struct swap_info_struct *si,
- swp_entry_t entry, int nr)
-{
- unsigned long offset = swp_offset(entry);
- struct swap_cluster_info *ci;
-
- ci = swap_cluster_lock(si, offset);
- if (swap_only_has_cache(si, offset, nr)) {
- swap_entries_free(si, ci, entry, nr);
- } else {
- for (int i = 0; i < nr; i++, entry.val++)
- swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
- }
- swap_cluster_unlock(ci);
-}
-
-static bool swap_entries_put_map(struct swap_info_struct *si,
- swp_entry_t entry, int nr)
-{
- unsigned long offset = swp_offset(entry);
- struct swap_cluster_info *ci;
- bool has_cache = false;
- unsigned char count;
- int i;
-
- if (nr <= 1)
- goto fallback;
- count = swap_count(data_race(si->swap_map[offset]));
- if (count != 1 && count != SWAP_MAP_SHMEM)
- goto fallback;
-
- ci = swap_cluster_lock(si, offset);
- if (!swap_is_last_map(si, offset, nr, &has_cache)) {
- goto locked_fallback;
- }
- if (!has_cache)
- swap_entries_free(si, ci, entry, nr);
- else
- for (i = 0; i < nr; i++)
- WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
- swap_cluster_unlock(ci);
-
- return has_cache;
-
-fallback:
- ci = swap_cluster_lock(si, offset);
-locked_fallback:
- for (i = 0; i < nr; i++, entry.val++) {
- count = swap_entry_put_locked(si, ci, entry, 1);
- if (count == SWAP_HAS_CACHE)
- has_cache = true;
- }
- swap_cluster_unlock(ci);
- return has_cache;
-}
-
-/*
- * Only functions with "_nr" suffix are able to free entries spanning
- * cross multi clusters, so ensure the range is within a single cluster
- * when freeing entries with functions without "_nr" suffix.
- */
-static bool swap_entries_put_map_nr(struct swap_info_struct *si,
- swp_entry_t entry, int nr)
-{
- int cluster_nr, cluster_rest;
- unsigned long offset = swp_offset(entry);
- bool has_cache = false;
-
- cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;
- while (nr) {
- cluster_nr = min(nr, cluster_rest);
- has_cache |= swap_entries_put_map(si, entry, cluster_nr);
- cluster_rest = SWAPFILE_CLUSTER;
- nr -= cluster_nr;
- entry.val += cluster_nr;
- }
-
- return has_cache;
-}
-
-/*
- * Check if it's the last ref of swap entry in the freeing path.
- * Qualified value includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM.
- */
-static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
-{
- return (count == SWAP_HAS_CACHE) || (count == 1) ||
- (count == SWAP_MAP_SHMEM);
-}
-
/*
* Drop the last ref of swap entries, caller have to ensure all entries
* belong to the same cgroup and cluster.
*/
-static void swap_entries_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry, unsigned int nr_pages)
+void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long offset, unsigned int nr_pages)
{
- unsigned long offset = swp_offset(entry);
+ swp_entry_t entry = swp_entry(si->type, offset);
unsigned char *map = si->swap_map + offset;
unsigned char *map_end = map + nr_pages;
@@ -1723,7 +1704,7 @@ static void swap_entries_free(struct swap_info_struct *si,
ci->count -= nr_pages;
do {
- VM_BUG_ON(!swap_is_last_ref(*map));
+ VM_WARN_ON(*map > 1);
*map = 0;
} while (++map < map_end);
@@ -1737,55 +1718,18 @@ static void swap_entries_free(struct swap_info_struct *si,
partial_free_cluster(si, ci);
}
-/*
- * Caller has made sure that the swap device corresponding to entry
- * is still around or has not been recycled.
- */
-void swap_free_nr(swp_entry_t entry, int nr_pages)
-{
- int nr;
- struct swap_info_struct *sis;
- unsigned long offset = swp_offset(entry);
-
- sis = _swap_info_get(entry);
- if (!sis)
- return;
-
- while (nr_pages) {
- nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
- swap_entries_put_map(sis, swp_entry(sis->type, offset), nr);
- offset += nr;
- nr_pages -= nr;
- }
-}
-
-/*
- * Called after dropping swapcache to decrease refcnt to swap entries.
- */
-void put_swap_folio(struct folio *folio, swp_entry_t entry)
-{
- struct swap_info_struct *si;
- int size = 1 << swap_entry_order(folio_order(folio));
-
- si = _swap_info_get(entry);
- if (!si)
- return;
-
- swap_entries_put_cache(si, entry, size);
-}
-
int __swap_count(swp_entry_t entry)
{
struct swap_info_struct *si = __swap_entry_to_info(entry);
pgoff_t offset = swp_offset(entry);
- return swap_count(si->swap_map[offset]);
+ return si->swap_map[offset];
}
-/*
- * How many references to @entry are currently swapped out?
- * This does not give an exact answer when swap count is continued,
- * but does include the high COUNT_CONTINUED flag to allow for that.
+/**
+ * swap_entry_swapped - Check if the swap entry is swapped.
+ * @si: the swap device.
+ * @entry: the swap entry.
*/
bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
{
@@ -1794,9 +1738,10 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
int count;
ci = swap_cluster_lock(si, offset);
- count = swap_count(si->swap_map[offset]);
+ count = si->swap_map[offset];
swap_cluster_unlock(ci);
- return !!count;
+
+ return count && count != SWAP_MAP_BAD;
}
/*
@@ -1812,7 +1757,7 @@ int swp_swapcount(swp_entry_t entry)
pgoff_t offset;
unsigned char *map;
- si = _swap_info_get(entry);
+ si = get_swap_device(entry);
if (!si)
return 0;
@@ -1820,7 +1765,7 @@ int swp_swapcount(swp_entry_t entry)
ci = swap_cluster_lock(si, offset);
- count = swap_count(si->swap_map[offset]);
+ count = si->swap_map[offset];
if (!(count & COUNT_CONTINUED))
goto out;
@@ -1842,6 +1787,7 @@ int swp_swapcount(swp_entry_t entry)
} while (tmp_count & COUNT_CONTINUED);
out:
swap_cluster_unlock(ci);
+ put_swap_device(si);
return count;
}
@@ -1858,12 +1804,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
ci = swap_cluster_lock(si, offset);
if (nr_pages == 1) {
- if (swap_count(map[roffset]))
+ if (map[roffset])
ret = true;
goto unlock_out;
}
for (i = 0; i < nr_pages; i++) {
- if (swap_count(map[offset + i])) {
+ if (map[offset + i]) {
ret = true;
break;
}
@@ -1876,11 +1822,12 @@ unlock_out:
static bool folio_swapped(struct folio *folio)
{
swp_entry_t entry = folio->swap;
- struct swap_info_struct *si = _swap_info_get(entry);
+ struct swap_info_struct *si;
- if (!si)
- return false;
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+ si = __swap_entry_to_info(entry);
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
return swap_entry_swapped(si, entry);
@@ -1939,73 +1886,45 @@ bool folio_free_swap(struct folio *folio)
}
/**
- * free_swap_and_cache_nr() - Release reference on range of swap entries and
- * reclaim their cache if no more references remain.
+ * swap_put_entries_direct() - Release reference on range of swap entries and
+ * reclaim their cache if no more references remain.
* @entry: First entry of range.
* @nr: Number of entries in range.
*
* For each swap entry in the contiguous range, release a reference. If any swap
* entries become free, try to reclaim their underlying folios, if present. The
* offset range is defined by [entry.offset, entry.offset + nr).
+ *
+ * Context: Caller must ensure there is no race condition on the reference
+ * owner. e.g., locking the PTL of a PTE containing the entry being released.
*/
-void free_swap_and_cache_nr(swp_entry_t entry, int nr)
+void swap_put_entries_direct(swp_entry_t entry, int nr)
{
const unsigned long start_offset = swp_offset(entry);
const unsigned long end_offset = start_offset + nr;
+ unsigned long offset, cluster_end;
struct swap_info_struct *si;
- bool any_only_cache = false;
- unsigned long offset;
si = get_swap_device(entry);
- if (!si)
+ if (WARN_ON_ONCE(!si))
return;
-
- if (WARN_ON(end_offset > si->max))
- goto out;
-
- /*
- * First free all entries in the range.
- */
- any_only_cache = swap_entries_put_map_nr(si, entry, nr);
-
- /*
- * Short-circuit the below loop if none of the entries had their
- * reference drop to zero.
- */
- if (!any_only_cache)
+ if (WARN_ON_ONCE(end_offset > si->max))
goto out;
- /*
- * Now go back over the range trying to reclaim the swap cache.
- */
- for (offset = start_offset; offset < end_offset; offset += nr) {
- nr = 1;
- if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
- /*
- * Folios are always naturally aligned in swap so
- * advance forward to the next boundary. Zero means no
- * folio was found for the swap entry, so advance by 1
- * in this case. Negative value means folio was found
- * but could not be reclaimed. Here we can still advance
- * to the next boundary.
- */
- nr = __try_to_reclaim_swap(si, offset,
- TTRS_UNMAPPED | TTRS_FULL);
- if (nr == 0)
- nr = 1;
- else if (nr < 0)
- nr = -nr;
- nr = ALIGN(offset + 1, nr) - offset;
- }
- }
-
+ /* Put entries and reclaim cache in each cluster */
+ offset = start_offset;
+ do {
+ cluster_end = min(round_up(offset + 1, SWAPFILE_CLUSTER), end_offset);
+ swap_put_entries_cluster(si, offset, cluster_end - offset, true);
+ offset = cluster_end;
+ } while (offset < end_offset);
out:
put_swap_device(si);
}
#ifdef CONFIG_HIBERNATION
-
-swp_entry_t get_swap_page_of_type(int type)
+/* Allocate a slot for hibernation */
+swp_entry_t swap_alloc_hibernation_slot(int type)
{
struct swap_info_struct *si = swap_type_to_info(type);
unsigned long offset;
@@ -2018,11 +1937,11 @@ swp_entry_t get_swap_page_of_type(int type)
if (get_swap_device_info(si)) {
if (si->flags & SWP_WRITEOK) {
/*
- * Grab the local lock to be complaint
+ * Grab the local lock to be compliant
* with swap table allocation.
*/
local_lock(&percpu_swap_cluster.lock);
- offset = cluster_alloc_swap_entry(si, 0, 1);
+ offset = cluster_alloc_swap_entry(si, NULL);
local_unlock(&percpu_swap_cluster.lock);
if (offset)
entry = swp_entry(si->type, offset);
@@ -2033,6 +1952,26 @@ fail:
return entry;
}
+/* Free a slot allocated by swap_alloc_hibernation_slot */
+void swap_free_hibernation_slot(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+ pgoff_t offset = swp_offset(entry);
+
+ si = get_swap_device(entry);
+ if (WARN_ON(!si))
+ return;
+
+ ci = swap_cluster_lock(si, offset);
+ swap_put_entry_locked(si, ci, offset);
+ swap_cluster_unlock(ci);
+
+ /* In theory readahead might add it to the swap cache by accident */
+ __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
+ put_swap_device(si);
+}
+
/*
* Find the swap type that corresponds to given device (if any).
*
@@ -2194,7 +2133,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
/*
* Some architectures may have to restore extra metadata to the page
* when reading from swap. This metadata may be indexed by swap entry
- * so this must be called before swap_free().
+ * so this must be called before folio_put_swap().
*/
arch_swap_restore(folio_swap(entry, folio), folio);
@@ -2235,7 +2174,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
new_pte = pte_mkuffd_wp(new_pte);
setpte:
set_pte_at(vma->vm_mm, addr, pte, new_pte);
- swap_free(entry);
+ folio_put_swap(swapcache, folio_file_page(swapcache, swp_offset(entry)));
out:
if (pte)
pte_unmap_unlock(pte, ptl);
@@ -2430,6 +2369,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
unsigned int prev)
{
unsigned int i;
+ unsigned long swp_tb;
unsigned char count;
/*
@@ -2440,7 +2380,11 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
*/
for (i = prev + 1; i < si->max; i++) {
count = READ_ONCE(si->swap_map[i]);
- if (count && swap_count(count) != SWAP_MAP_BAD)
+ swp_tb = swap_table_get(__swap_offset_to_cluster(si, i),
+ i % SWAPFILE_CLUSTER);
+ if (count == SWAP_MAP_BAD)
+ continue;
+ if (count || swp_tb_is_folio(swp_tb))
break;
if ((i % LATENCY_LIMIT) == 0)
cond_resched();
@@ -3650,67 +3594,39 @@ void si_swapinfo(struct sysinfo *val)
* Returns error code in following case.
* - success -> 0
* - swp_entry is invalid -> EINVAL
- * - swap-cache reference is requested but there is already one. -> EEXIST
- * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference is requested but the entry is not used. -> ENOENT
* - swap-mapped reference requested but needs continued swap count. -> ENOMEM
*/
-static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
+static int swap_dup_entries(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long offset,
+ unsigned char usage, int nr)
{
- struct swap_info_struct *si;
- struct swap_cluster_info *ci;
- unsigned long offset;
+ int i;
unsigned char count;
- unsigned char has_cache;
- int err, i;
-
- si = swap_entry_to_info(entry);
- if (WARN_ON_ONCE(!si)) {
- pr_err("%s%08lx\n", Bad_file, entry.val);
- return -EINVAL;
- }
-
- offset = swp_offset(entry);
- VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
- VM_WARN_ON(usage == 1 && nr > 1);
- ci = swap_cluster_lock(si, offset);
- err = 0;
for (i = 0; i < nr; i++) {
count = si->swap_map[offset + i];
-
/*
- * swapin_readahead() doesn't check if a swap entry is valid, so the
- * swap entry could be SWAP_MAP_BAD. Check here with lock held.
+ * For swapin out, allocator never allocates bad slots. for
+ * swapin, readahead is guarded by swap_entry_swapped.
*/
- if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
- err = -ENOENT;
- goto unlock_out;
- }
-
- has_cache = count & SWAP_HAS_CACHE;
- count &= ~SWAP_HAS_CACHE;
-
- if (!count && !has_cache) {
- err = -ENOENT;
- } else if (usage == SWAP_HAS_CACHE) {
- if (has_cache)
- err = -EEXIST;
- } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) {
- err = -EINVAL;
- }
-
- if (err)
- goto unlock_out;
+ if (WARN_ON(count == SWAP_MAP_BAD))
+ return -ENOENT;
+ /*
+ * Swap count duplication must be guarded by either swap cache folio (from
+ * folio_dup_swap) or external lock of existing entry (from swap_dup_entry_direct).
+ */
+ if (WARN_ON(!count &&
+ !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))))
+ return -ENOENT;
+ if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX))
+ return -EINVAL;
}
for (i = 0; i < nr; i++) {
count = si->swap_map[offset + i];
- has_cache = count & SWAP_HAS_CACHE;
- count &= ~SWAP_HAS_CACHE;
-
- if (usage == SWAP_HAS_CACHE)
- has_cache = SWAP_HAS_CACHE;
- else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+ if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
count += usage;
else if (swap_count_continued(si, offset + i, count))
count = COUNT_CONTINUED;
@@ -3719,66 +3635,56 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
* Don't need to rollback changes, because if
* usage == 1, there must be nr == 1.
*/
- err = -ENOMEM;
- goto unlock_out;
+ return -ENOMEM;
}
- WRITE_ONCE(si->swap_map[offset + i], count | has_cache);
+ WRITE_ONCE(si->swap_map[offset + i], count);
}
-unlock_out:
- swap_cluster_unlock(ci);
- return err;
+ return 0;
}
-/*
- * Help swapoff by noting that swap entry belongs to shmem/tmpfs
- * (in which case its reference count is never incremented).
- */
-void swap_shmem_alloc(swp_entry_t entry, int nr)
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
{
- __swap_duplicate(entry, SWAP_MAP_SHMEM, nr);
+ int err;
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+
+ si = swap_entry_to_info(entry);
+ if (WARN_ON_ONCE(!si)) {
+ pr_err("%s%08lx\n", Bad_file, entry.val);
+ return -EINVAL;
+ }
+
+ VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
+ ci = swap_cluster_lock(si, offset);
+ err = swap_dup_entries(si, ci, offset, usage, nr);
+ swap_cluster_unlock(ci);
+ return err;
}
/*
- * Increase reference count of swap entry by 1.
+ * swap_dup_entry_direct() - Increase reference count of a swap entry by one.
+ * @entry: first swap entry from which we want to increase the refcount.
+ *
* Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
* but could not be atomically allocated. Returns 0, just as if it succeeded,
* if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
* might occur if a page table entry has got corrupted.
+ *
+ * Context: Caller must ensure there is no race condition on the reference
+ * owner. e.g., locking the PTL of a PTE containing the entry being increased.
*/
-int swap_duplicate(swp_entry_t entry)
+int swap_dup_entry_direct(swp_entry_t entry)
{
int err = 0;
-
while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
err = add_swap_count_continuation(entry, GFP_ATOMIC);
return err;
}
/*
- * @entry: first swap entry from which we allocate nr swap cache.
- *
- * Called when allocating swap cache for existing swap entries,
- * This can return error codes. Returns 0 at success.
- * -EEXIST means there is a swap cache.
- * Note: return code is different from swap_duplicate().
- */
-int swapcache_prepare(swp_entry_t entry, int nr)
-{
- return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
-}
-
-/*
- * Caller should ensure entries belong to the same folio so
- * the entries won't span cross cluster boundary.
- */
-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
-{
- swap_entries_put_cache(si, entry, nr);
-}
-
-/*
* add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
* page of the original vmalloc'ed swap_map, to hold the continuation count
@@ -3823,7 +3729,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
ci = swap_cluster_lock(si, offset);
- count = swap_count(si->swap_map[offset]);
+ count = si->swap_map[offset];
if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
/*
@@ -3895,7 +3801,7 @@ outer:
* into, carry if so, or else fail until a new continuation page is allocated;
* when the original swap_map count is decremented from 0 with continuation,
* borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or caller of swap_entry_put_locked()
+ * Called while __swap_duplicate() or caller of swap_put_entry_locked()
* holds cluster lock.
*/
static bool swap_count_continued(struct swap_info_struct *si,
diff --git a/mm/tests/lazy_mmu_mode_kunit.c b/mm/tests/lazy_mmu_mode_kunit.c
new file mode 100644
index 000000000000..b689241c6bef
--- /dev/null
+++ b/mm/tests/lazy_mmu_mode_kunit.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+#include <linux/pgtable.h>
+
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
+
+static void expect_not_active(struct kunit *test)
+{
+ KUNIT_EXPECT_FALSE(test, is_lazy_mmu_mode_active());
+}
+
+static void expect_active(struct kunit *test)
+{
+ KUNIT_EXPECT_TRUE(test, is_lazy_mmu_mode_active());
+}
+
+static void lazy_mmu_mode_active(struct kunit *test)
+{
+ expect_not_active(test);
+
+ lazy_mmu_mode_enable();
+ expect_active(test);
+
+ {
+ /* Nested section */
+ lazy_mmu_mode_enable();
+ expect_active(test);
+
+ lazy_mmu_mode_disable();
+ expect_active(test);
+ }
+
+ {
+ /* Paused section */
+ lazy_mmu_mode_pause();
+ expect_not_active(test);
+
+ {
+ /* No effect (paused) */
+ lazy_mmu_mode_enable();
+ expect_not_active(test);
+
+ lazy_mmu_mode_disable();
+ expect_not_active(test);
+
+ lazy_mmu_mode_pause();
+ expect_not_active(test);
+
+ lazy_mmu_mode_resume();
+ expect_not_active(test);
+ }
+
+ lazy_mmu_mode_resume();
+ expect_active(test);
+ }
+
+ lazy_mmu_mode_disable();
+ expect_not_active(test);
+}
+
+static struct kunit_case lazy_mmu_mode_test_cases[] = {
+ KUNIT_CASE(lazy_mmu_mode_active),
+ {}
+};
+
+static struct kunit_suite lazy_mmu_mode_test_suite = {
+ .name = "lazy_mmu_mode",
+ .test_cases = lazy_mmu_mode_test_cases,
+};
+kunit_test_suite(lazy_mmu_mode_test_suite);
+
+MODULE_DESCRIPTION("Tests for the lazy MMU mode");
+MODULE_LICENSE("GPL");
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e6dfd5f28acd..927086bb4a3c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1103,7 +1103,7 @@ static long move_present_ptes(struct mm_struct *mm,
/* It's safe to drop the reference now as the page-table is holding one. */
folio_put(*first_src_folio);
*first_src_folio = NULL;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
while (true) {
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
@@ -1140,7 +1140,7 @@ static long move_present_ptes(struct mm_struct *mm,
break;
}
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
if (src_addr > src_start)
flush_tlb_range(src_vma, src_start, src_addr);
@@ -1190,17 +1190,13 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
* Check if the swap entry is cached after acquiring the src_pte
* lock. Otherwise, we might miss a newly loaded swap cache folio.
*
- * Check swap_map directly to minimize overhead, READ_ONCE is sufficient.
* We are trying to catch newly added swap cache, the only possible case is
* when a folio is swapped in and out again staying in swap cache, using the
* same entry before the PTE check above. The PTL is acquired and released
- * twice, each time after updating the swap_map's flag. So holding
- * the PTL here ensures we see the updated value. False positive is possible,
- * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
- * cache, or during the tiny synchronization window between swap cache and
- * swap_map, but it will be gone very quickly, worst result is retry jitters.
+ * twice, each time after updating the swap table. So holding
+ * the PTL here ensures we see the updated value.
*/
- if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
+ if (swap_cache_has_folio(entry)) {
double_pt_unlock(dst_ptl, src_ptl);
return -EAGAIN;
}
@@ -1274,7 +1270,7 @@ retry:
* Use the maywrite version to indicate that dst_pte will be modified,
* since dst_pte needs to be none, the subsequent pte_same() check
* cannot prevent the dst_pte page from being freed concurrently, so we
- * also need to abtain dst_pmdval and recheck pmd_same() later.
+ * also need to obtain dst_pmdval and recheck pmd_same() later.
*/
dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
&dst_ptl);
@@ -1330,7 +1326,7 @@ retry:
goto out;
}
- /* If PTE changed after we locked the folio them start over */
+ /* If PTE changed after we locked the folio then start over */
if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
ret = -EAGAIN;
goto out;
diff --git a/mm/vma.c b/mm/vma.c
index 7a908a964d18..3dbe414eff89 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -381,7 +381,7 @@ again:
fput(vp->file);
}
if (vp->remove->anon_vma)
- anon_vma_merge(vp->vma, vp->remove);
+ unlink_anon_vmas(vp->remove);
mm->map_count--;
mpol_put(vma_policy(vp->remove));
if (!vp->remove2)
@@ -530,7 +530,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (err)
goto out_free_vmi;
- err = anon_vma_clone(new, vma);
+ err = anon_vma_clone(new, vma, VMA_OP_SPLIT);
if (err)
goto out_free_mpol;
@@ -628,7 +628,7 @@ static int dup_anon_vma(struct vm_area_struct *dst,
vma_assert_write_locked(dst);
dst->anon_vma = src->anon_vma;
- ret = anon_vma_clone(dst, src);
+ ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED);
if (ret)
return ret;
@@ -1901,7 +1901,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
vma_set_range(new_vma, addr, addr + len, pgoff);
if (vma_dup_policy(vma, new_vma))
goto out_free_vma;
- if (anon_vma_clone(new_vma, vma))
+ if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP))
goto out_free_mempol;
if (new_vma->vm_file)
get_file(new_vma->vm_file);
@@ -2951,10 +2951,10 @@ retry:
return -ENOMEM;
/*
- * Adjust for the gap first so it doesn't interfere with the
- * later alignment. The first step is the minimum needed to
- * fulill the start gap, the next steps is the minimum to align
- * that. It is the minimum needed to fulill both.
+ * Adjust for the gap first so it doesn't interfere with the later
+ * alignment. The first step is the minimum needed to fulfill the start
+ * gap, the next step is the minimum to align that. It is the minimum
+ * needed to fulfill both.
*/
gap = vma_iter_addr(&vmi) + info->start_gap;
gap += (info->align_offset - gap) & info->align_mask;
diff --git a/mm/vma.h b/mm/vma.h
index 9d5ee6ac913a..d51efd9da113 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -267,7 +267,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);
/**
- * vma_modify_flags() - Peform any necessary split/merge in preparation for
+ * vma_modify_flags() - Perform any necessary split/merge in preparation for
* setting VMA flags to *@vm_flags in the range @start to @end contained within
* @vma.
* @vmi: Valid VMA iterator positioned at @vma.
@@ -295,7 +295,7 @@ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
vm_flags_t *vm_flags_ptr);
/**
- * vma_modify_name() - Peform any necessary split/merge in preparation for
+ * vma_modify_name() - Perform any necessary split/merge in preparation for
* setting anonymous VMA name to @new_name in the range @start to @end contained
* within @vma.
* @vmi: Valid VMA iterator positioned at @vma.
@@ -319,7 +319,7 @@ __must_check struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi,
struct anon_vma_name *new_name);
/**
- * vma_modify_policy() - Peform any necessary split/merge in preparation for
+ * vma_modify_policy() - Perform any necessary split/merge in preparation for
* setting NUMA policy to @new_pol in the range @start to @end contained
* within @vma.
* @vmi: Valid VMA iterator positioned at @vma.
@@ -343,7 +343,7 @@ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi,
struct mempolicy *new_pol);
/**
- * vma_modify_flags_uffd() - Peform any necessary split/merge in preparation for
+ * vma_modify_flags_uffd() - Perform any necessary split/merge in preparation for
* setting VMA flags to @vm_flags and UFFD context to @new_ctx in the range
* @start to @end contained within @vma.
* @vmi: Valid VMA iterator positioned at @vma.
@@ -561,12 +561,6 @@ static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
return vmi->mas.last + 1;
}
-static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
- unsigned long count)
-{
- return mas_expected_entries(&vmi->mas, count);
-}
-
static inline
struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
{
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e286c2d2068c..03e1117480d5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -108,7 +108,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (!pte)
return -ENOMEM;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
if (unlikely(!pte_none(ptep_get(pte)))) {
@@ -134,7 +134,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -305,6 +305,11 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end,
int err;
pgtbl_mod_mask mask = 0;
+ /*
+ * Might allocate pagetables (for most archs a more precise annotation
+ * would be might_alloc(GFP_PGTABLE_KERNEL)). Also might shootdown TLB
+ * (requires IRQs enabled on x86).
+ */
might_sleep();
BUG_ON(addr >= end);
@@ -366,7 +371,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
#ifdef CONFIG_HUGETLB_PAGE
@@ -385,7 +390,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
} while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
*mask |= PGTBL_PTE_MODIFIED;
}
@@ -533,7 +538,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte)
return -ENOMEM;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
struct page *page = pages[*nr];
@@ -555,7 +560,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
*mask |= PGTBL_PTE_MODIFIED;
return err;
@@ -2268,11 +2273,14 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
reclaim_list_global(&decay_list);
}
+#define KASAN_RELEASE_BATCH_SIZE 32
+
static void
kasan_release_vmalloc_node(struct vmap_node *vn)
{
struct vmap_area *va;
unsigned long start, end;
+ unsigned int batch_count = 0;
start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
@@ -2282,6 +2290,11 @@ kasan_release_vmalloc_node(struct vmap_node *vn)
kasan_release_vmalloc(va->va_start, va->va_end,
va->va_start, va->va_end,
KASAN_VMALLOC_PAGE_RANGE);
+
+ if (need_resched() || (++batch_count >= KASAN_RELEASE_BATCH_SIZE)) {
+ cond_resched();
+ batch_count = 0;
+ }
}
kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
@@ -4354,6 +4367,7 @@ need_realloc:
return n;
}
+EXPORT_SYMBOL(vrealloc_node_align_noprof);
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 614ccf39fe3f..3fc4a4461927 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,7 +63,6 @@
#include <asm/div64.h>
#include <linux/swapops.h>
-#include <linux/balloon_compaction.h>
#include <linux/sched/sysctl.h>
#include "internal.h"
@@ -104,13 +103,13 @@ struct scan_control {
unsigned int force_deactivate:1;
unsigned int skipped_deactivate:1;
- /* Writepage batching in laptop mode; RECLAIM_WRITE */
+ /* zone_reclaim_mode, boost reclaim */
unsigned int may_writepage:1;
- /* Can mapped folios be reclaimed? */
+ /* zone_reclaim_mode */
unsigned int may_unmap:1;
- /* Can folios be swapped as part of reclaim? */
+ /* zome_reclaim_mode, boost reclaim, cgroup restrictions */
unsigned int may_swap:1;
/* Not allow cache_trim_mode to be turned on as part of reclaim? */
@@ -507,7 +506,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
* If kswapd is disabled, reschedule if necessary but do not
* throttle as the system is likely near OOM.
*/
- if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+ if (kswapd_test_hopeless(pgdat))
return true;
/*
@@ -758,10 +757,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
- __swap_cache_del_folio(ci, folio, swap, shadow);
memcg1_swapout(folio, swap);
+ __swap_cache_del_folio(ci, folio, swap, shadow);
swap_cluster_unlock_irq(ci);
- put_swap_folio(folio, swap);
} else {
void (*free_folio)(struct folio *);
@@ -1063,7 +1061,7 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
/*
* We can "enter_fs" for swap-cache with only __GFP_IO
* providing this isn't SWP_FS_OPS.
- * ->flags can be updated non-atomicially (scan_swap_map_slots),
+ * ->flags can be updated non-atomically (scan_swap_map_slots),
* but that will never affect SWP_FS_OPS, so the data_race
* is safe.
*/
@@ -1276,58 +1274,58 @@ retry:
* Try to allocate it some swap space here.
* Lazyfree folio could be freed directly
*/
- if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
- if (!folio_test_swapcache(folio)) {
- if (!(sc->gfp_mask & __GFP_IO))
- goto keep_locked;
- if (folio_maybe_dma_pinned(folio))
- goto keep_locked;
- if (folio_test_large(folio)) {
- /* cannot split folio, skip it */
- if (folio_expected_ref_count(folio) !=
- folio_ref_count(folio) - 1)
- goto activate_locked;
- /*
- * Split partially mapped folios right away.
- * We can free the unmapped pages without IO.
- */
- if (data_race(!list_empty(&folio->_deferred_list) &&
- folio_test_partially_mapped(folio)) &&
- split_folio_to_list(folio, folio_list))
- goto activate_locked;
- }
- if (folio_alloc_swap(folio)) {
- int __maybe_unused order = folio_order(folio);
-
- if (!folio_test_large(folio))
- goto activate_locked_split;
- /* Fallback to swap normal pages */
- if (split_folio_to_list(folio, folio_list))
- goto activate_locked;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (nr_pages >= HPAGE_PMD_NR) {
- count_memcg_folio_events(folio,
- THP_SWPOUT_FALLBACK, 1);
- count_vm_event(THP_SWPOUT_FALLBACK);
- }
-#endif
- count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
- if (folio_alloc_swap(folio))
- goto activate_locked_split;
- }
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)) {
+ if (!(sc->gfp_mask & __GFP_IO))
+ goto keep_locked;
+ if (folio_maybe_dma_pinned(folio))
+ goto keep_locked;
+ if (folio_test_large(folio)) {
+ /* cannot split folio, skip it */
+ if (folio_expected_ref_count(folio) !=
+ folio_ref_count(folio) - 1)
+ goto activate_locked;
/*
- * Normally the folio will be dirtied in unmap because its
- * pte should be dirty. A special case is MADV_FREE page. The
- * page's pte could have dirty bit cleared but the folio's
- * SwapBacked flag is still set because clearing the dirty bit
- * and SwapBacked flag has no lock protected. For such folio,
- * unmap will not set dirty bit for it, so folio reclaim will
- * not write the folio out. This can cause data corruption when
- * the folio is swapped in later. Always setting the dirty flag
- * for the folio solves the problem.
+ * Split partially mapped folios right away.
+ * We can free the unmapped pages without IO.
*/
- folio_mark_dirty(folio);
+ if (data_race(!list_empty(&folio->_deferred_list) &&
+ folio_test_partially_mapped(folio)) &&
+ split_folio_to_list(folio, folio_list))
+ goto activate_locked;
}
+ if (folio_alloc_swap(folio)) {
+ int __maybe_unused order = folio_order(folio);
+
+ if (!folio_test_large(folio))
+ goto activate_locked_split;
+ /* Fallback to swap normal pages */
+ if (split_folio_to_list(folio, folio_list))
+ goto activate_locked;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (nr_pages >= HPAGE_PMD_NR) {
+ count_memcg_folio_events(folio,
+ THP_SWPOUT_FALLBACK, 1);
+ count_vm_event(THP_SWPOUT_FALLBACK);
+ }
+#endif
+ count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
+ if (folio_alloc_swap(folio))
+ goto activate_locked_split;
+ }
+ /*
+ * Normally the folio will be dirtied in unmap because
+ * its pte should be dirty. A special case is MADV_FREE
+ * page. The page's pte could have dirty bit cleared but
+ * the folio's SwapBacked flag is still set because
+ * clearing the dirty bit and SwapBacked flag has no
+ * lock protected. For such folio, unmap will not set
+ * dirty bit for it, so folio reclaim will not write the
+ * folio out. This can cause data corruption when the
+ * folio is swapped in later. Always setting the dirty
+ * flag for the folio solves the problem.
+ */
+ folio_mark_dirty(folio);
}
/*
@@ -2451,9 +2449,9 @@ static inline void calculate_pressure_balance(struct scan_control *sc,
static unsigned long apply_proportional_protection(struct mem_cgroup *memcg,
struct scan_control *sc, unsigned long scan)
{
- unsigned long min, low;
+ unsigned long min, low, usage;
- mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low);
+ mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low, &usage);
if (min || low) {
/*
@@ -2485,7 +2483,6 @@ static unsigned long apply_proportional_protection(struct mem_cgroup *memcg,
* again by how much of the total memory used is under
* hard protection.
*/
- unsigned long cgroup_size = mem_cgroup_size(memcg);
unsigned long protection;
/* memory.low scaling, make sure we retry before OOM */
@@ -2497,9 +2494,9 @@ static unsigned long apply_proportional_protection(struct mem_cgroup *memcg,
}
/* Avoid TOCTOU with earlier protection check */
- cgroup_size = max(cgroup_size, protection);
+ usage = max(usage, protection);
- scan -= scan * protection / (cgroup_size + 1);
+ scan -= scan * protection / (usage + 1);
/*
* Minimally target SWAP_CLUSTER_MAX pages to keep
@@ -3516,7 +3513,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
return false;
}
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
restart:
for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
unsigned long pfn;
@@ -3557,7 +3554,7 @@ restart:
if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
goto restart;
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
pte_unmap_unlock(pte, ptl);
return suitable_to_scan(total, young);
@@ -3598,7 +3595,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
if (!spin_trylock(ptl))
goto done;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
unsigned long pfn;
@@ -3645,7 +3642,7 @@ next:
walk_update_folio(walk, last, gen, dirty);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
spin_unlock(ptl);
done:
*first = -1;
@@ -4244,7 +4241,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
}
}
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
pte -= (addr - start) / PAGE_SIZE;
@@ -4278,7 +4275,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
walk_update_folio(walk, last, gen, dirty);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
/* feedback from rmap walkers to page table walkers */
if (mm_state && suitable_to_scan(i, young))
@@ -5067,7 +5064,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
blk_finish_plug(&plug);
done:
if (sc->nr_reclaimed > reclaimed)
- atomic_set(&pgdat->kswapd_failures, 0);
+ kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx);
}
/******************************************************************************
@@ -5417,7 +5414,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
if (memcg)
cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
#endif
- seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
+ seq_printf(m, "memcg %llu %s\n", mem_cgroup_id(memcg), path);
}
seq_printf(m, " node %5d\n", nid);
@@ -5502,7 +5499,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
return -EINTR;
}
-static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
+static int run_cmd(char cmd, u64 memcg_id, int nid, unsigned long seq,
struct scan_control *sc, int swappiness, unsigned long opt)
{
struct lruvec *lruvec;
@@ -5513,14 +5510,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
return -EINVAL;
if (!mem_cgroup_disabled()) {
- rcu_read_lock();
-
- memcg = mem_cgroup_from_id(memcg_id);
- if (!mem_cgroup_tryget(memcg))
- memcg = NULL;
-
- rcu_read_unlock();
-
+ memcg = mem_cgroup_get_from_id(memcg_id);
if (!memcg)
return -EINVAL;
}
@@ -5592,7 +5582,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
int n;
int end;
char cmd, swap_string[5];
- unsigned int memcg_id;
+ u64 memcg_id;
unsigned int nid;
unsigned long seq;
unsigned int swappiness;
@@ -5602,7 +5592,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
if (!*cur)
continue;
- n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid,
+ n = sscanf(cur, "%c %llu %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid,
&seq, &end, swap_string, &end, &opt, &end);
if (n < 4 || cur[end]) {
err = -EINVAL;
@@ -6141,7 +6131,7 @@ again:
* successful direct reclaim run will revive a dormant kswapd.
*/
if (reclaimable)
- atomic_set(&pgdat->kswapd_failures, 0);
+ kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx);
else if (sc->cache_trim_mode)
sc->cache_trim_mode_failed = 1;
}
@@ -6366,13 +6356,6 @@ retry:
if (sc->compaction_ready)
break;
-
- /*
- * If we're getting trouble reclaiming, start doing
- * writepage even in laptop mode.
- */
- if (sc->priority < DEF_PRIORITY - 2)
- sc->may_writepage = 1;
} while (--sc->priority >= 0);
last_pgdat = NULL;
@@ -6453,7 +6436,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
int i;
bool wmark_ok;
- if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+ if (kswapd_test_hopeless(pgdat))
return true;
for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
@@ -6581,7 +6564,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.order = order,
.nodemask = nodemask,
.priority = DEF_PRIORITY,
- .may_writepage = !laptop_mode,
+ .may_writepage = 1,
.may_unmap = 1,
.may_swap = 1,
};
@@ -6625,7 +6608,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.target_mem_cgroup = memcg,
- .may_writepage = !laptop_mode,
+ .may_writepage = 1,
.may_unmap = 1,
.reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
@@ -6671,7 +6654,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.reclaim_idx = MAX_NR_ZONES - 1,
.target_mem_cgroup = memcg,
.priority = DEF_PRIORITY,
- .may_writepage = !laptop_mode,
+ .may_writepage = 1,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
@@ -6862,7 +6845,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
wake_up_all(&pgdat->pfmemalloc_wait);
/* Hopeless node, leave it to direct reclaim */
- if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+ if (kswapd_test_hopeless(pgdat))
return true;
if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
@@ -7052,7 +7035,7 @@ restart:
* from reclaim context. If no pages are reclaimed, the
* reclaim will be aborted.
*/
- sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
+ sc.may_writepage = !nr_boost_reclaim;
sc.may_swap = !nr_boost_reclaim;
/*
@@ -7062,13 +7045,6 @@ restart:
*/
kswapd_age_node(pgdat, &sc);
- /*
- * If we're getting trouble reclaiming, start doing writepage
- * even in laptop mode.
- */
- if (sc.priority < DEF_PRIORITY - 2)
- sc.may_writepage = 1;
-
/* Call soft limit reclaim before calling shrink_node. */
sc.nr_scanned = 0;
nr_soft_scanned = 0;
@@ -7134,8 +7110,11 @@ restart:
* watermark_high at this point. We need to avoid increasing the
* failure count to prevent the kswapd thread from stopping.
*/
- if (!sc.nr_reclaimed && !boosted)
- atomic_inc(&pgdat->kswapd_failures);
+ if (!sc.nr_reclaimed && !boosted) {
+ int fail_cnt = atomic_inc_return(&pgdat->kswapd_failures);
+ /* kswapd context, low overhead to trace every failure */
+ trace_mm_vmscan_kswapd_reclaim_fail(pgdat->node_id, fail_cnt);
+ }
out:
clear_reclaim_active(pgdat, highest_zoneidx);
@@ -7394,7 +7373,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
return;
/* Hopeless node, leave it to direct reclaim if possible */
- if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES ||
+ if (kswapd_test_hopeless(pgdat) ||
(pgdat_balanced(pgdat, order, highest_zoneidx) &&
!pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
/*
@@ -7414,6 +7393,32 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
wake_up_interruptible(&pgdat->kswapd_wait);
}
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason)
+{
+ /* Only trace actual resets, not redundant zero-to-zero */
+ if (atomic_xchg(&pgdat->kswapd_failures, 0))
+ trace_mm_vmscan_kswapd_clear_hopeless(pgdat->node_id, reason);
+}
+
+/*
+ * Reset kswapd_failures only when the node is balanced. Without this
+ * check, successful direct reclaim (e.g., from cgroup memory.high
+ * throttling) can keep resetting kswapd_failures even when the node
+ * cannot be balanced, causing kswapd to run endlessly.
+ */
+void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
+ unsigned int order, int highest_zoneidx)
+{
+ if (pgdat_balanced(pgdat, order, highest_zoneidx))
+ kswapd_clear_hopeless(pgdat, current_is_kswapd() ?
+ KSWAPD_CLEAR_HOPELESS_KSWAPD : KSWAPD_CLEAR_HOPELESS_DIRECT);
+}
+
+bool kswapd_test_hopeless(pg_data_t *pgdat)
+{
+ return atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES;
+}
+
#ifdef CONFIG_HIBERNATION
/*
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -7465,8 +7470,8 @@ void __meminit kswapd_run(int nid)
pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
- pr_err("Failed to start kswapd on node %d,ret=%ld\n",
- nid, PTR_ERR(pgdat->kswapd));
+ pr_err("Failed to start kswapd on node %d, ret=%pe\n",
+ nid, pgdat->kswapd);
BUG_ON(system_state < SYSTEM_RUNNING);
pgdat->kswapd = NULL;
} else {
@@ -7800,7 +7805,7 @@ int user_proactive_reclaim(char *buf,
.reclaim_idx = gfp_zone(gfp_mask),
.proactive_swappiness = swappiness == -1 ? NULL : &swappiness,
.priority = DEF_PRIORITY,
- .may_writepage = !laptop_mode,
+ .may_writepage = 1,
.nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX),
.may_unmap = 1,
.may_swap = 1,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d6e814c82952..86b14b0f77b5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -672,11 +672,6 @@ void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
}
EXPORT_SYMBOL(mod_node_page_state);
-void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
-{
- mod_node_state(pgdat, item, 1, 1);
-}
-
void inc_node_page_state(struct page *page, enum node_stat_item item)
{
mod_node_state(page_pgdat(page), item, 1, 1);
@@ -725,16 +720,6 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
}
EXPORT_SYMBOL(dec_zone_page_state);
-void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __inc_node_state(pgdat, item);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(inc_node_state);
-
void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long delta)
{
@@ -1434,13 +1419,13 @@ const char * const vmstat_text[] = {
[I(THP_SWPOUT)] = "thp_swpout",
[I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback",
#endif
-#ifdef CONFIG_MEMORY_BALLOON
+#ifdef CONFIG_BALLOON
[I(BALLOON_INFLATE)] = "balloon_inflate",
[I(BALLOON_DEFLATE)] = "balloon_deflate",
-#ifdef CONFIG_BALLOON_COMPACTION
+#ifdef CONFIG_BALLOON_MIGRATION
[I(BALLOON_MIGRATE)] = "balloon_migrate",
-#endif
-#endif /* CONFIG_MEMORY_BALLOON */
+#endif /* CONFIG_BALLOON_MIGRATION */
+#endif /* CONFIG_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
[I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush",
[I(NR_TLB_REMOTE_FLUSH_RECEIVED)] = "nr_tlb_remote_flush_received",
@@ -1626,7 +1611,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
}
}
-/* Print out the free pages at each order for each migatetype */
+/* Print out the free pages at each order for each migratetype */
static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
int order;
@@ -1855,7 +1840,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n start_pfn: %lu"
"\n reserved_highatomic: %lu"
"\n free_highatomic: %lu",
- atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES,
+ kswapd_test_hopeless(pgdat),
zone->zone_start_pfn,
zone->nr_reserved_highatomic,
zone->nr_free_highatomic);
@@ -2281,7 +2266,8 @@ void __init init_mm_internals(void)
{
int ret __maybe_unused;
- mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
+ mm_percpu_wq = alloc_workqueue("mm_percpu_wq",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
#ifdef CONFIG_SMP
ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
diff --git a/mm/workingset.c b/mm/workingset.c
index e9f05634747a..13422d304715 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -254,7 +254,7 @@ static void *lru_gen_eviction(struct folio *folio)
hist = lru_hist_from_seq(min_seq);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
- return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
+ return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset);
}
/*
@@ -271,7 +271,7 @@ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
- memcg = mem_cgroup_from_id(memcg_id);
+ memcg = mem_cgroup_from_private_id(memcg_id);
*lruvec = mem_cgroup_lruvec(memcg, pgdat);
max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
@@ -395,7 +395,7 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
- memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
+ memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
eviction >>= bucket_order;
workingset_age_nonresident(lruvec, folio_nr_pages(folio));
@@ -456,7 +456,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
* would be better if the root_mem_cgroup existed in all
* configurations instead.
*/
- eviction_memcg = mem_cgroup_from_id(memcgid);
+ eviction_memcg = mem_cgroup_from_private_id(memcgid);
if (!mem_cgroup_tryget(eviction_memcg))
eviction_memcg = NULL;
rcu_read_unlock();
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 5bf832f9c05c..d5d1c27b3852 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -30,6 +30,7 @@
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
+#include <linux/scatterlist.h>
#include <linux/spinlock.h>
#include <linux/sprintf.h>
#include <linux/shrinker.h>
@@ -105,7 +106,7 @@
/*
* On systems with 4K page size, this gives 255 size classes! There is a
- * trader-off here:
+ * trade-off here:
* - Large number of size classes is potentially wasteful as free page are
* spread across these classes
* - Small number of size classes causes large internal fragmentation
@@ -192,12 +193,13 @@ struct link_free {
};
};
+static struct kmem_cache *handle_cachep;
+static struct kmem_cache *zspage_cachep;
+
struct zs_pool {
const char *name;
struct size_class *size_class[ZS_SIZE_CLASSES];
- struct kmem_cache *handle_cachep;
- struct kmem_cache *zspage_cachep;
atomic_long_t pages_allocated;
@@ -370,60 +372,28 @@ static void init_deferred_free(struct zs_pool *pool) {}
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
#endif
-static int create_cache(struct zs_pool *pool)
+static unsigned long cache_alloc_handle(gfp_t gfp)
{
- char *name;
-
- name = kasprintf(GFP_KERNEL, "zs_handle-%s", pool->name);
- if (!name)
- return -ENOMEM;
- pool->handle_cachep = kmem_cache_create(name, ZS_HANDLE_SIZE,
- 0, 0, NULL);
- kfree(name);
- if (!pool->handle_cachep)
- return -EINVAL;
-
- name = kasprintf(GFP_KERNEL, "zspage-%s", pool->name);
- if (!name)
- return -ENOMEM;
- pool->zspage_cachep = kmem_cache_create(name, sizeof(struct zspage),
- 0, 0, NULL);
- kfree(name);
- if (!pool->zspage_cachep) {
- kmem_cache_destroy(pool->handle_cachep);
- pool->handle_cachep = NULL;
- return -EINVAL;
- }
-
- return 0;
-}
+ gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
-static void destroy_cache(struct zs_pool *pool)
-{
- kmem_cache_destroy(pool->handle_cachep);
- kmem_cache_destroy(pool->zspage_cachep);
+ return (unsigned long)kmem_cache_alloc(handle_cachep, gfp);
}
-static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
+static void cache_free_handle(unsigned long handle)
{
- return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ kmem_cache_free(handle_cachep, (void *)handle);
}
-static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
+static struct zspage *cache_alloc_zspage(gfp_t gfp)
{
- kmem_cache_free(pool->handle_cachep, (void *)handle);
-}
+ gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
-static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
-{
- return kmem_cache_zalloc(pool->zspage_cachep,
- flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ return kmem_cache_zalloc(zspage_cachep, gfp);
}
-static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
+static void cache_free_zspage(struct zspage *zspage)
{
- kmem_cache_free(pool->zspage_cachep, zspage);
+ kmem_cache_free(zspage_cachep, zspage);
}
/* class->lock(which owns the handle) synchronizes races */
@@ -852,7 +822,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
zpdesc = next;
} while (zpdesc != NULL);
- cache_free_zspage(pool, zspage);
+ cache_free_zspage(zspage);
class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
@@ -965,7 +935,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
{
int i;
struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
- struct zspage *zspage = cache_alloc_zspage(pool, gfp);
+ struct zspage *zspage = cache_alloc_zspage(gfp);
if (!zspage)
return NULL;
@@ -987,7 +957,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
zpdesc_dec_zone_page_state(zpdescs[i]);
free_zpdesc(zpdescs[i]);
}
- cache_free_zspage(pool, zspage);
+ cache_free_zspage(zspage);
return NULL;
}
__zpdesc_set_zsmalloc(zpdesc);
@@ -1065,7 +1035,7 @@ unsigned long zs_get_total_pages(struct zs_pool *pool)
EXPORT_SYMBOL_GPL(zs_get_total_pages);
void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle,
- void *local_copy)
+ size_t mem_len, void *local_copy)
{
struct zspage *zspage;
struct zpdesc *zpdesc;
@@ -1087,7 +1057,10 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle,
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
- if (off + class->size <= PAGE_SIZE) {
+ if (!ZsHugePage(zspage))
+ off += ZS_HANDLE_SIZE;
+
+ if (off + mem_len <= PAGE_SIZE) {
/* this object is contained entirely within a page */
addr = kmap_local_zpdesc(zpdesc);
addr += off;
@@ -1096,7 +1069,7 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle,
/* this object spans two pages */
sizes[0] = PAGE_SIZE - off;
- sizes[1] = class->size - sizes[0];
+ sizes[1] = mem_len - sizes[0];
addr = local_copy;
memcpy_from_page(addr, zpdesc_page(zpdesc),
@@ -1107,15 +1080,12 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle,
0, sizes[1]);
}
- if (!ZsHugePage(zspage))
- addr += ZS_HANDLE_SIZE;
-
return addr;
}
EXPORT_SYMBOL_GPL(zs_obj_read_begin);
void zs_obj_read_end(struct zs_pool *pool, unsigned long handle,
- void *handle_mem)
+ size_t mem_len, void *handle_mem)
{
struct zspage *zspage;
struct zpdesc *zpdesc;
@@ -1129,9 +1099,10 @@ void zs_obj_read_end(struct zs_pool *pool, unsigned long handle,
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
- if (off + class->size <= PAGE_SIZE) {
- if (!ZsHugePage(zspage))
- off += ZS_HANDLE_SIZE;
+ if (!ZsHugePage(zspage))
+ off += ZS_HANDLE_SIZE;
+
+ if (off + mem_len <= PAGE_SIZE) {
handle_mem -= off;
kunmap_local(handle_mem);
}
@@ -1140,6 +1111,68 @@ void zs_obj_read_end(struct zs_pool *pool, unsigned long handle,
}
EXPORT_SYMBOL_GPL(zs_obj_read_end);
+void zs_obj_read_sg_begin(struct zs_pool *pool, unsigned long handle,
+ struct scatterlist *sg, size_t mem_len)
+{
+ struct zspage *zspage;
+ struct zpdesc *zpdesc;
+ unsigned long obj, off;
+ unsigned int obj_idx;
+ struct size_class *class;
+
+ /* Guarantee we can get zspage from handle safely */
+ read_lock(&pool->lock);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &zpdesc, &obj_idx);
+ zspage = get_zspage(zpdesc);
+
+ /* Make sure migration doesn't move any pages in this zspage */
+ zspage_read_lock(zspage);
+ read_unlock(&pool->lock);
+
+ class = zspage_class(pool, zspage);
+ off = offset_in_page(class->size * obj_idx);
+
+ if (!ZsHugePage(zspage))
+ off += ZS_HANDLE_SIZE;
+
+ if (off + mem_len <= PAGE_SIZE) {
+ /* this object is contained entirely within a page */
+ sg_init_table(sg, 1);
+ sg_set_page(sg, zpdesc_page(zpdesc), mem_len, off);
+ } else {
+ size_t sizes[2];
+
+ /* this object spans two pages */
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = mem_len - sizes[0];
+
+ sg_init_table(sg, 2);
+ sg_set_page(sg, zpdesc_page(zpdesc), sizes[0], off);
+
+ zpdesc = get_next_zpdesc(zpdesc);
+ sg = sg_next(sg);
+
+ sg_set_page(sg, zpdesc_page(zpdesc), sizes[1], 0);
+ }
+}
+EXPORT_SYMBOL_GPL(zs_obj_read_sg_begin);
+
+void zs_obj_read_sg_end(struct zs_pool *pool, unsigned long handle)
+{
+ struct zspage *zspage;
+ struct zpdesc *zpdesc;
+ unsigned long obj;
+ unsigned int obj_idx;
+
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &zpdesc, &obj_idx);
+ zspage = get_zspage(zpdesc);
+
+ zspage_read_unlock(zspage);
+}
+EXPORT_SYMBOL_GPL(zs_obj_read_sg_end);
+
void zs_obj_write(struct zs_pool *pool, unsigned long handle,
void *handle_mem, size_t mem_len)
{
@@ -1275,7 +1308,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
if (unlikely(size > ZS_MAX_ALLOC_SIZE))
return (unsigned long)ERR_PTR(-ENOSPC);
- handle = cache_alloc_handle(pool, gfp);
+ handle = cache_alloc_handle(gfp);
if (!handle)
return (unsigned long)ERR_PTR(-ENOMEM);
@@ -1299,7 +1332,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
zspage = alloc_zspage(pool, class, gfp, nid);
if (!zspage) {
- cache_free_handle(pool, handle);
+ cache_free_handle(handle);
return (unsigned long)ERR_PTR(-ENOMEM);
}
@@ -1379,7 +1412,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
free_zspage(pool, class, zspage);
spin_unlock(&class->lock);
- cache_free_handle(pool, handle);
+ cache_free_handle(handle);
}
EXPORT_SYMBOL_GPL(zs_free);
@@ -2041,9 +2074,6 @@ struct zs_pool *zs_create_pool(const char *name)
if (!pool->name)
goto err;
- if (create_cache(pool))
- goto err;
-
/*
* Iterate reversely, because, size of size_class that we want to use
* for merging should be larger or equal to current size.
@@ -2165,20 +2195,47 @@ void zs_destroy_pool(struct zs_pool *pool)
kfree(class);
}
- destroy_cache(pool);
kfree(pool->name);
kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
+static void zs_destroy_caches(void)
+{
+ kmem_cache_destroy(handle_cachep);
+ handle_cachep = NULL;
+ kmem_cache_destroy(zspage_cachep);
+ zspage_cachep = NULL;
+}
+
+static int __init zs_init_caches(void)
+{
+ handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ 0, 0, NULL);
+ zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
+ 0, 0, NULL);
+
+ if (!handle_cachep || !zspage_cachep) {
+ zs_destroy_caches();
+ return -ENOMEM;
+ }
+ return 0;
+}
+
static int __init zs_init(void)
{
- int rc __maybe_unused;
+ int rc;
+
+ rc = zs_init_caches();
+ if (rc)
+ return rc;
#ifdef CONFIG_COMPACTION
rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
- if (rc)
+ if (rc) {
+ zs_destroy_caches();
return rc;
+ }
#endif
zs_stat_init();
return 0;
@@ -2190,6 +2247,7 @@ static void __exit zs_exit(void)
set_movable_ops(NULL, PGTY_zsmalloc);
#endif
zs_stat_exit();
+ zs_destroy_caches();
}
module_init(zs_init);
diff --git a/mm/zswap.c b/mm/zswap.c
index ac9b7a60736b..af3f0fbb0558 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -26,6 +26,7 @@
#include <linux/mempolicy.h>
#include <linux/mempool.h>
#include <crypto/acompress.h>
+#include <crypto/scatterwalk.h>
#include <linux/zswap.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
@@ -141,7 +142,6 @@ struct crypto_acomp_ctx {
struct crypto_wait wait;
u8 *buffer;
struct mutex mutex;
- bool is_sleepable;
};
/*
@@ -749,8 +749,8 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
if (IS_ERR(acomp)) {
- pr_err("could not alloc crypto acomp %s : %ld\n",
- pool->tfm_name, PTR_ERR(acomp));
+ pr_err("could not alloc crypto acomp %s : %pe\n",
+ pool->tfm_name, acomp);
ret = PTR_ERR(acomp);
goto fail;
}
@@ -781,7 +781,6 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_ctx->buffer = buffer;
acomp_ctx->acomp = acomp;
- acomp_ctx->is_sleepable = acomp_is_async(acomp);
acomp_ctx->req = req;
mutex_unlock(&acomp_ctx->mutex);
return 0;
@@ -933,52 +932,41 @@ unlock:
static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
{
struct zswap_pool *pool = entry->pool;
- struct scatterlist input, output;
+ struct scatterlist input[2]; /* zsmalloc returns an SG list 1-2 entries */
+ struct scatterlist output;
struct crypto_acomp_ctx *acomp_ctx;
- int decomp_ret = 0, dlen = PAGE_SIZE;
- u8 *src, *obj;
+ int ret = 0, dlen;
acomp_ctx = acomp_ctx_get_cpu_lock(pool);
- obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffer);
+ zs_obj_read_sg_begin(pool->zs_pool, entry->handle, input, entry->length);
/* zswap entries of length PAGE_SIZE are not compressed. */
if (entry->length == PAGE_SIZE) {
- memcpy_to_folio(folio, 0, obj, entry->length);
- goto read_done;
- }
-
- /*
- * zs_obj_read_begin() might return a kmap address of highmem when
- * acomp_ctx->buffer is not used. However, sg_init_one() does not
- * handle highmem addresses, so copy the object to acomp_ctx->buffer.
- */
- if (virt_addr_valid(obj)) {
- src = obj;
+ WARN_ON_ONCE(input->length != PAGE_SIZE);
+ memcpy_from_sglist(kmap_local_folio(folio, 0), input, 0, PAGE_SIZE);
+ dlen = PAGE_SIZE;
} else {
- WARN_ON_ONCE(obj == acomp_ctx->buffer);
- memcpy(acomp_ctx->buffer, obj, entry->length);
- src = acomp_ctx->buffer;
+ sg_init_table(&output, 1);
+ sg_set_folio(&output, folio, PAGE_SIZE, 0);
+ acomp_request_set_params(acomp_ctx->req, input, &output,
+ entry->length, PAGE_SIZE);
+ ret = crypto_acomp_decompress(acomp_ctx->req);
+ ret = crypto_wait_req(ret, &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
}
- sg_init_one(&input, src, entry->length);
- sg_init_table(&output, 1);
- sg_set_folio(&output, folio, PAGE_SIZE, 0);
- acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
- decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
- dlen = acomp_ctx->req->dlen;
-
-read_done:
- zs_obj_read_end(pool->zs_pool, entry->handle, obj);
+ zs_obj_read_sg_end(pool->zs_pool, entry->handle);
acomp_ctx_put_unlock(acomp_ctx);
- if (!decomp_ret && dlen == PAGE_SIZE)
+ if (!ret && dlen == PAGE_SIZE)
return true;
zswap_decompress_fail++;
pr_alert_ratelimited("Decompression error from zswap (%d:%lu %s %u->%d)\n",
swp_type(entry->swpentry),
swp_offset(entry->swpentry),
- entry->pool->tfm_name, entry->length, dlen);
+ entry->pool->tfm_name,
+ entry->length, dlen);
return false;
}
@@ -1014,8 +1002,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
return -EEXIST;
mpol = get_task_policy(current);
- folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
- NO_INTERLEAVE_INDEX, &folio_was_allocated, true);
+ folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, mpol,
+ NO_INTERLEAVE_INDEX, &folio_was_allocated);
put_swap_device(si);
if (!folio)
return -ENOMEM;