diff options
Diffstat (limited to 'mm')
87 files changed, 3376 insertions, 2510 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index fbac1dfc9943..ebd8ea353687 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -582,23 +582,20 @@ config SPLIT_PMD_PTLOCKS # # support for memory balloon -config MEMORY_BALLOON +config BALLOON bool # -# support for memory balloon compaction -config BALLOON_COMPACTION - bool "Allow for balloon memory compaction/migration" +# support for memory balloon page migration +config BALLOON_MIGRATION + bool "Allow for balloon memory migration" default y - depends on COMPACTION && MEMORY_BALLOON + depends on MIGRATION && BALLOON help - Memory fragmentation introduced by ballooning might reduce - significantly the number of 2MB contiguous memory blocks that can be - used within a guest, thus imposing performance penalties associated - with the reduced number of transparent huge pages that could be used - by the guest workload. Allowing the compaction & migration for memory - pages enlisted as being part of memory balloon devices avoids the - scenario aforementioned and helps improving memory defragmentation. + Allow for migration of pages inflated in a memory balloon such that + they can be allocated from memory areas only available for movable + allocations (e.g., ZONE_MOVABLE, CMA) and such that they can be + migrated for memory defragmentation purposes by memory compaction. # # support for memory compaction @@ -1440,14 +1437,12 @@ config ARCH_HAS_USER_SHADOW_STACK The architecture has hardware support for userspace shadow call stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss). -config ARCH_SUPPORTS_PT_RECLAIM +config HAVE_ARCH_TLB_REMOVE_TABLE def_bool n config PT_RECLAIM - bool "reclaim empty user page table pages" - default y - depends on ARCH_SUPPORTS_PT_RECLAIM && MMU && SMP - select MMU_GATHER_RCU_TABLE_FREE + def_bool y + depends on MMU_GATHER_RCU_TABLE_FREE && !HAVE_ARCH_TLB_REMOVE_TABLE help Try to reclaim empty user page table pages in paths other than munmap and exit_mmap path. 
@@ -1457,6 +1452,25 @@ config PT_RECLAIM config FIND_NORMAL_PAGE def_bool n +config ARCH_HAS_LAZY_MMU_MODE + bool + help + The architecture uses the lazy MMU mode. This allows changes to + MMU-related architectural state to be deferred until the mode is + exited. See <linux/pgtable.h> for details. + +config LAZY_MMU_MODE_KUNIT_TEST + tristate "KUnit tests for the lazy MMU mode" if !KUNIT_ALL_TESTS + depends on ARCH_HAS_LAZY_MMU_MODE + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable this option to check that the lazy MMU mode interface behaves + as expected. Only tests for the generic interface are included (not + architecture-specific behaviours). + + If unsure, say N. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index bf46fe31dc14..fd30164933a5 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -125,7 +125,7 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_NUMA_MEMBLKS) += numa_memblks.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o +obj-$(CONFIG_BALLOON) += balloon.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o @@ -149,4 +149,4 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o -obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o +obj-$(CONFIG_LAZY_MMU_MODE_KUNIT_TEST) += tests/lazy_mmu_mode_kunit.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c5740c6d37a2..e319bd5e8b75 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -939,7 +939,7 @@ void wb_memcg_offline(struct mem_cgroup *memcg) memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); - queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); + queue_work(system_dfl_wq, &cleanup_offline_cgwbs_work); } /** @@ -971,10 +971,10 @@ static int __init 
cgwb_init(void) { /* * There can be many concurrent release work items overwhelming - * system_wq. Put them in a separate wq and limit concurrency. + * system_percpu_wq. Put them in a separate wq and limit concurrency. * There's no point in executing many of these in parallel. */ - cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); + cgwb_release_wq = alloc_workqueue("cgwb_release", WQ_PERCPU, 1); if (!cgwb_release_wq) return -ENOMEM; @@ -1034,7 +1034,6 @@ struct backing_dev_info *bdi_alloc(int node_id) bdi->capabilities = BDI_CAP_WRITEBACK; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; - timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); @@ -1156,8 +1155,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { - timer_delete_sync(&bdi->laptop_mode_wb_timer); - /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); diff --git a/mm/balloon_compaction.c b/mm/balloon.c index 03c5dbabb156..96a8f1e20bc6 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon.c @@ -1,28 +1,62 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * mm/balloon_compaction.c - * - * Common interface for making balloon pages movable by compaction. + * Common interface for implementing a memory balloon, including support + * for migration of pages inflated in a memory balloon. * * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com> */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/export.h> -#include <linux/balloon_compaction.h> +#include <linux/balloon.h> + +/* + * Lock protecting the balloon_dev_info of all devices. We don't really + * expect more than one device. + */ +static DEFINE_SPINLOCK(balloon_pages_lock); + +/** + * balloon_page_insert - insert a page into the balloon's page list and make + * the page->private assignment accordingly. 
+ * @balloon : pointer to balloon device + * @page : page to be assigned as a 'balloon page' + * + * Caller must ensure the balloon_pages_lock is held. + */ +static void balloon_page_insert(struct balloon_dev_info *balloon, + struct page *page) +{ + lockdep_assert_held(&balloon_pages_lock); + __SetPageOffline(page); + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) { + SetPageMovableOps(page); + set_page_private(page, (unsigned long)balloon); + } + list_add(&page->lru, &balloon->pages); +} + +/** + * balloon_page_finalize - prepare a balloon page that was removed from the + * balloon list for release to the page allocator + * @page: page to be released to the page allocator + * + * Caller must ensure the balloon_pages_lock is held. + */ +static void balloon_page_finalize(struct page *page) +{ + lockdep_assert_held(&balloon_pages_lock); + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) + set_page_private(page, 0); + /* PageOffline is sticky until the page is freed to the buddy. */ +} static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, struct page *page) { - /* - * Block others from accessing the 'page' when we get around to - * establishing additional references. We should be the only one - * holding a reference to the 'page' at this point. If we are not, then - * memory corruption is possible and we should stop execution. 
- */ - BUG_ON(!trylock_page(page)); balloon_page_insert(b_dev_info, page); - unlock_page(page); + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, -1); __count_vm_event(BALLOON_INFLATE); inc_node_page_state(page, NR_BALLOON_PAGES); } @@ -45,13 +79,13 @@ size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, unsigned long flags; size_t n_pages = 0; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); list_for_each_entry_safe(page, tmp, pages, lru) { list_del(&page->lru); balloon_page_enqueue_one(b_dev_info, page); n_pages++; } - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); return n_pages; } EXPORT_SYMBOL_GPL(balloon_page_list_enqueue); @@ -81,34 +115,26 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, unsigned long flags; size_t n_pages = 0; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { if (n_pages == n_req_pages) break; - - /* - * Block others from accessing the 'page' while we get around to - * establishing additional references and preparing the 'page' - * to be released by the balloon driver. - */ - if (!trylock_page(page)) - continue; - list_del(&page->lru); + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, 1); balloon_page_finalize(page); __count_vm_event(BALLOON_DEFLATE); list_add(&page->lru, pages); - unlock_page(page); dec_node_page_state(page, NR_BALLOON_PAGES); n_pages++; } - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); return n_pages; } EXPORT_SYMBOL_GPL(balloon_page_list_dequeue); -/* +/** * balloon_page_alloc - allocates a new page for insertion into the balloon * page list. 
* @@ -120,14 +146,18 @@ EXPORT_SYMBOL_GPL(balloon_page_list_dequeue); */ struct page *balloon_page_alloc(void) { - struct page *page = alloc_page(balloon_mapping_gfp_mask() | - __GFP_NOMEMALLOC | __GFP_NORETRY | - __GFP_NOWARN); - return page; + gfp_t gfp_flags = __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + + if (IS_ENABLED(CONFIG_BALLOON_MIGRATION)) + gfp_flags |= GFP_HIGHUSER_MOVABLE; + else + gfp_flags |= GFP_HIGHUSER; + + return alloc_page(gfp_flags); } EXPORT_SYMBOL_GPL(balloon_page_alloc); -/* +/** * balloon_page_enqueue - inserts a new page into the balloon page list. * * @b_dev_info: balloon device descriptor where we will insert a new page @@ -136,22 +166,21 @@ EXPORT_SYMBOL_GPL(balloon_page_alloc); * Drivers must call this function to properly enqueue a new allocated balloon * page before definitively removing the page from the guest system. * - * Drivers must not call balloon_page_enqueue on pages that have been pushed to - * a list with balloon_page_push before removing them with balloon_page_pop. To - * enqueue a list of pages, use balloon_page_list_enqueue instead. + * Drivers must not enqueue pages while page->lru is still in + * use, and must not use page->lru until a page was unqueued again. */ void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, struct page *page) { unsigned long flags; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); balloon_page_enqueue_one(b_dev_info, page); - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); } EXPORT_SYMBOL_GPL(balloon_page_enqueue); -/* +/** * balloon_page_dequeue - removes a page from balloon's page list and returns * its address to allow the driver to release the page. * @b_dev_info: balloon device descriptor where we will grab a page from. 
@@ -187,32 +216,42 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) * BUG() here, otherwise the balloon driver may get stuck in * an infinite loop while attempting to release all its pages. */ - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); if (unlikely(list_empty(&b_dev_info->pages) && !b_dev_info->isolated_pages)) BUG(); - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); return NULL; } return list_first_entry(&pages, struct page, lru); } EXPORT_SYMBOL_GPL(balloon_page_dequeue); -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION +static struct balloon_dev_info *balloon_page_device(struct page *page) +{ + return (struct balloon_dev_info *)page_private(page); +} static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) { - struct balloon_dev_info *b_dev_info = balloon_page_device(page); + struct balloon_dev_info *b_dev_info; unsigned long flags; - if (!b_dev_info) + spin_lock_irqsave(&balloon_pages_lock, flags); + b_dev_info = balloon_page_device(page); + if (!b_dev_info) { + /* + * The page already got deflated and removed from the + * balloon list. + */ + spin_unlock_irqrestore(&balloon_pages_lock, flags); return false; - - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + } list_del(&page->lru); b_dev_info->isolated_pages++; - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); return true; } @@ -222,33 +261,75 @@ static void balloon_page_putback(struct page *page) struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; - /* Isolated balloon pages cannot get deflated. */ + /* + * When we isolated the page, the page was still inflated in a balloon + * device. As isolated balloon pages cannot get deflated, we still have + * a balloon device here. 
+ */ if (WARN_ON_ONCE(!b_dev_info)) return; - spin_lock_irqsave(&b_dev_info->pages_lock, flags); + spin_lock_irqsave(&balloon_pages_lock, flags); list_add(&page->lru, &b_dev_info->pages); b_dev_info->isolated_pages--; - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + spin_unlock_irqrestore(&balloon_pages_lock, flags); } -/* move_to_new_page() counterpart for a ballooned page */ static int balloon_page_migrate(struct page *newpage, struct page *page, enum migrate_mode mode) { - struct balloon_dev_info *balloon = balloon_page_device(page); - - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + struct balloon_dev_info *b_dev_info = balloon_page_device(page); + unsigned long flags; + int rc; - /* Isolated balloon pages cannot get deflated. */ - if (WARN_ON_ONCE(!balloon)) + /* + * When we isolated the page, the page was still inflated in a balloon + * device. As isolated balloon pages cannot get deflated, we still have + * a balloon device here. + */ + if (WARN_ON_ONCE(!b_dev_info)) return -EAGAIN; - return balloon->migratepage(balloon, newpage, page, mode); + rc = b_dev_info->migratepage(b_dev_info, newpage, page, mode); + if (rc < 0 && rc != -ENOENT) + return rc; + + spin_lock_irqsave(&balloon_pages_lock, flags); + if (!rc) { + /* Insert the new page into the balloon list. */ + get_page(newpage); + balloon_page_insert(b_dev_info, newpage); + __count_vm_event(BALLOON_MIGRATE); + + if (b_dev_info->adjust_managed_page_count && + page_zone(page) != page_zone(newpage)) { + /* + * When we migrate a page to a different zone we + * have to fixup the count of both involved zones. + */ + adjust_managed_page_count(page, 1); + adjust_managed_page_count(newpage, -1); + } + } else { + /* Old page was deflated but new page not inflated. 
*/ + __count_vm_event(BALLOON_DEFLATE); + + if (b_dev_info->adjust_managed_page_count) + adjust_managed_page_count(page, 1); + } + + b_dev_info->isolated_pages--; + + /* Free the now-deflated page we isolated in balloon_page_isolate(). */ + balloon_page_finalize(page); + spin_unlock_irqrestore(&balloon_pages_lock, flags); + + put_page(page); + + return 0; } -const struct movable_operations balloon_mops = { +static const struct movable_operations balloon_mops = { .migrate_page = balloon_page_migrate, .isolate_page = balloon_page_isolate, .putback_page = balloon_page_putback, @@ -260,4 +341,4 @@ static int __init balloon_init(void) } core_initcall(balloon_init); -#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* CONFIG_BALLOON_MIGRATION */ @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/sizes.h> #include <linux/slab.h> +#include <linux/string.h> #include <linux/string_choices.h> #include <linux/log2.h> #include <linux/cma.h> @@ -233,7 +234,7 @@ static int __init cma_new_area(const char *name, phys_addr_t size, cma_area_count++; if (name) - snprintf(cma->name, CMA_MAX_NAME, "%s", name); + strscpy(cma->name, name); else snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count); @@ -836,7 +837,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, spin_unlock_irq(&cma->lock); mutex_lock(&cma->alloc_mutex); - ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); + ret = alloc_contig_frozen_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); mutex_unlock(&cma->alloc_mutex); if (!ret) break; @@ -856,8 +857,8 @@ out: return ret; } -static struct page *__cma_alloc(struct cma *cma, unsigned long count, - unsigned int align, gfp_t gfp) +static struct page *__cma_alloc_frozen(struct cma *cma, + unsigned long count, unsigned int align, gfp_t gfp) { struct page *page = NULL; int ret = -ENOMEM, r; @@ -914,6 +915,21 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, return page; } +struct page *cma_alloc_frozen(struct cma 
*cma, unsigned long count, + unsigned int align, bool no_warn) +{ + gfp_t gfp = GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0); + + return __cma_alloc_frozen(cma, count, align, gfp); +} + +struct page *cma_alloc_frozen_compound(struct cma *cma, unsigned int order) +{ + gfp_t gfp = GFP_KERNEL | __GFP_COMP | __GFP_NOWARN; + + return __cma_alloc_frozen(cma, 1 << order, order, gfp); +} + /** * cma_alloc() - allocate pages from contiguous area * @cma: Contiguous memory region for which the allocation is performed. @@ -927,49 +943,60 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn) { - return __cma_alloc(cma, count, align, GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); -} - -struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) -{ struct page *page; - if (WARN_ON(!order || !(gfp & __GFP_COMP))) - return NULL; - - page = __cma_alloc(cma, 1 << order, order, gfp); + page = cma_alloc_frozen(cma, count, align, no_warn); + if (page) + set_pages_refcounted(page, count); - return page ? 
page_folio(page) : NULL; + return page; } -bool cma_pages_valid(struct cma *cma, const struct page *pages, - unsigned long count) +static struct cma_memrange *find_cma_memrange(struct cma *cma, + const struct page *pages, unsigned long count) { - unsigned long pfn, end; + struct cma_memrange *cmr = NULL; + unsigned long pfn, end_pfn; int r; - struct cma_memrange *cmr; - bool ret; + + pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); if (!cma || !pages || count > cma->count) - return false; + return NULL; pfn = page_to_pfn(pages); - ret = false; for (r = 0; r < cma->nranges; r++) { cmr = &cma->ranges[r]; - end = cmr->base_pfn + cmr->count; - if (pfn >= cmr->base_pfn && pfn < end) { - ret = pfn + count <= end; - break; + end_pfn = cmr->base_pfn + cmr->count; + if (pfn >= cmr->base_pfn && pfn < end_pfn) { + if (pfn + count <= end_pfn) + break; + + VM_WARN_ON_ONCE(1); } } - if (!ret) - pr_debug("%s(page %p, count %lu)\n", - __func__, (void *)pages, count); + if (r == cma->nranges) { + pr_debug("%s(page %p, count %lu, no cma range matches the page range)\n", + __func__, (void *)pages, count); + return NULL; + } - return ret; + return cmr; +} + +static void __cma_release_frozen(struct cma *cma, struct cma_memrange *cmr, + const struct page *pages, unsigned long count) +{ + unsigned long pfn = page_to_pfn(pages); + + pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); + + free_contig_frozen_range(pfn, count); + cma_clear_bitmap(cma, cmr, pfn, count); + cma_sysfs_account_release_pages(cma, count); + trace_cma_release(cma->name, pfn, pages, count); } /** @@ -986,43 +1013,33 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned long count) { struct cma_memrange *cmr; - unsigned long pfn, end_pfn; - int r; + unsigned long i, pfn; - pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); - - if (!cma_pages_valid(cma, pages, count)) + cmr = find_cma_memrange(cma, pages, count); + if (!cmr) return false; pfn = 
page_to_pfn(pages); - end_pfn = pfn + count; + for (i = 0; i < count; i++, pfn++) + VM_WARN_ON(!put_page_testzero(pfn_to_page(pfn))); - for (r = 0; r < cma->nranges; r++) { - cmr = &cma->ranges[r]; - if (pfn >= cmr->base_pfn && - pfn < (cmr->base_pfn + cmr->count)) { - VM_BUG_ON(end_pfn > cmr->base_pfn + cmr->count); - break; - } - } - - if (r == cma->nranges) - return false; - - free_contig_range(pfn, count); - cma_clear_bitmap(cma, cmr, pfn, count); - cma_sysfs_account_release_pages(cma, count); - trace_cma_release(cma->name, pfn, pages, count); + __cma_release_frozen(cma, cmr, pages, count); return true; } -bool cma_free_folio(struct cma *cma, const struct folio *folio) +bool cma_release_frozen(struct cma *cma, const struct page *pages, + unsigned long count) { - if (WARN_ON(!folio_test_large(folio))) + struct cma_memrange *cmr; + + cmr = find_cma_memrange(cma, pages, count); + if (!cmr) return false; - return cma_release(cma, &folio->page, folio_nr_pages(folio)); + __cma_release_frozen(cma, cmr, pages, count); + + return true; } int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) diff --git a/mm/damon/core.c b/mm/damon/core.c index 84f80a20f233..5e2724a4f285 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -157,6 +157,12 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t) damon_free_region(r); } +static bool damon_is_last_region(struct damon_region *r, + struct damon_target *t) +{ + return list_is_last(&r->list, &t->regions_list); +} + /* * Check whether a region is intersecting an address range * @@ -197,7 +203,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * @t: the given target. * @ranges: array of new monitoring target ranges. * @nr_ranges: length of @ranges. - * @min_sz_region: minimum region size. + * @min_region_sz: minimum region size. * * This function adds new regions to, or modify existing regions of a * monitoring target to fit in specific ranges. 
@@ -205,7 +211,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * Return: 0 if success, or negative error code otherwise. */ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges, unsigned long min_sz_region) + unsigned int nr_ranges, unsigned long min_region_sz) { struct damon_region *r, *next; unsigned int i; @@ -242,16 +248,16 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, /* no region intersects with this range */ newr = damon_new_region( ALIGN_DOWN(range->start, - min_sz_region), - ALIGN(range->end, min_sz_region)); + min_region_sz), + ALIGN(range->end, min_region_sz)); if (!newr) return -ENOMEM; damon_insert_region(newr, damon_prev_region(r), r, t); } else { /* resize intersecting regions to fit in this range */ first->ar.start = ALIGN_DOWN(range->start, - min_sz_region); - last->ar.end = ALIGN(range->end, min_sz_region); + min_region_sz); + last->ar.end = ALIGN(range->end, min_region_sz); /* fill possible holes in the range */ err = damon_fill_regions_holes(first, last, t); @@ -278,7 +284,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type, } /** - * damos_filter_for_ops() - Return if the filter is ops-hndled one. + * damos_filter_for_ops() - Return if the filter is ops-handled one. * @type: type of the filter. 
* * Return: true if the filter of @type needs to be handled by ops layer, false @@ -395,6 +401,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, INIT_LIST_HEAD(&scheme->core_filters); INIT_LIST_HEAD(&scheme->ops_filters); scheme->stat = (struct damos_stat){}; + scheme->max_nr_snapshots = 0; INIT_LIST_HEAD(&scheme->list); scheme->quota = *(damos_quota_init(quota)); @@ -546,7 +553,7 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.max_nr_regions = 1000; ctx->addr_unit = 1; - ctx->min_sz_region = DAMON_MIN_REGION; + ctx->min_region_sz = DAMON_MIN_REGION_SZ; INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -1072,7 +1079,11 @@ static int damos_commit(struct damos *dst, struct damos *src) return err; err = damos_commit_filters(dst, src); - return err; + if (err) + return err; + + dst->max_nr_snapshots = src->max_nr_snapshots; + return 0; } static int damon_commit_schemes(struct damon_ctx *dst, struct damon_ctx *src) @@ -1131,7 +1142,7 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx) * If @src has no region, @dst keeps current regions. 
*/ static int damon_commit_target_regions(struct damon_target *dst, - struct damon_target *src, unsigned long src_min_sz_region) + struct damon_target *src, unsigned long src_min_region_sz) { struct damon_region *src_region; struct damon_addr_range *ranges; @@ -1148,7 +1159,7 @@ static int damon_commit_target_regions(struct damon_target *dst, i = 0; damon_for_each_region(src_region, src) ranges[i++] = src_region->ar; - err = damon_set_regions(dst, ranges, i, src_min_sz_region); + err = damon_set_regions(dst, ranges, i, src_min_region_sz); kfree(ranges); return err; } @@ -1156,11 +1167,11 @@ static int damon_commit_target_regions(struct damon_target *dst, static int damon_commit_target( struct damon_target *dst, bool dst_has_pid, struct damon_target *src, bool src_has_pid, - unsigned long src_min_sz_region) + unsigned long src_min_region_sz) { int err; - err = damon_commit_target_regions(dst, src, src_min_sz_region); + err = damon_commit_target_regions(dst, src, src_min_region_sz); if (err) return err; if (dst_has_pid) @@ -1187,7 +1198,7 @@ static int damon_commit_targets( err = damon_commit_target( dst_target, damon_target_has_pid(dst), src_target, damon_target_has_pid(src), - src->min_sz_region); + src->min_region_sz); if (err) return err; } else { @@ -1214,7 +1225,7 @@ static int damon_commit_targets( return -ENOMEM; err = damon_commit_target(new_target, false, src_target, damon_target_has_pid(src), - src->min_sz_region); + src->min_region_sz); if (err) { damon_destroy_target(new_target, NULL); return err; @@ -1261,7 +1272,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) } dst->ops = src->ops; dst->addr_unit = src->addr_unit; - dst->min_sz_region = src->min_sz_region; + dst->min_region_sz = src->min_region_sz; return 0; } @@ -1294,8 +1305,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) if (ctx->attrs.min_nr_regions) sz /= ctx->attrs.min_nr_regions; - if (sz < ctx->min_sz_region) - sz = ctx->min_sz_region; + if (sz < 
ctx->min_region_sz) + sz = ctx->min_region_sz; return sz; } @@ -1431,6 +1442,23 @@ bool damon_is_running(struct damon_ctx *ctx) return running; } +/** + * damon_kdamond_pid() - Return pid of a given DAMON context's worker thread. + * @ctx: The DAMON context of the question. + * + * Return: pid if @ctx is running, negative error code otherwise. + */ +int damon_kdamond_pid(struct damon_ctx *ctx) +{ + int pid = -EINVAL; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) + pid = ctx->kdamond->pid; + mutex_unlock(&ctx->kdamond_lock); + return pid; +} + /* * damon_call_handle_inactive_ctx() - handle DAMON call request that added to * an inactive context. @@ -1604,7 +1632,7 @@ static unsigned long damon_get_intervals_adaptation_bp(struct damon_ctx *c) adaptation_bp = damon_feed_loop_next_input(100000000, score_bp) / 10000; /* - * adaptaion_bp ranges from 1 to 20,000. Avoid too rapid reduction of + * adaptation_bp ranges from 1 to 20,000. Avoid too rapid reduction of * the intervals by rescaling [1,10,000] to [5000, 10,000]. */ if (adaptation_bp <= 10000) @@ -1668,7 +1696,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * @t: The target of the region. * @rp: The pointer to the region. * @s: The scheme to be applied. - * @min_sz_region: minimum region size. + * @min_region_sz: minimum region size. * * If a quota of a scheme has exceeded in a quota charge window, the scheme's * action would applied to only a part of the target access pattern fulfilling @@ -1686,7 +1714,8 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * Return: true if the region should be entirely skipped, false otherwise. 
*/ static bool damos_skip_charged_region(struct damon_target *t, - struct damon_region **rp, struct damos *s, unsigned long min_sz_region) + struct damon_region **rp, struct damos *s, + unsigned long min_region_sz) { struct damon_region *r = *rp; struct damos_quota *quota = &s->quota; @@ -1708,11 +1737,11 @@ static bool damos_skip_charged_region(struct damon_target *t, if (quota->charge_addr_from && r->ar.start < quota->charge_addr_from) { sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, min_sz_region); + r->ar.start, min_region_sz); if (!sz_to_skip) { - if (damon_sz_region(r) <= min_sz_region) + if (damon_sz_region(r) <= min_region_sz) return true; - sz_to_skip = min_sz_region; + sz_to_skip = min_region_sz; } damon_split_region_at(t, r, sz_to_skip); r = damon_next_region(r); @@ -1738,7 +1767,7 @@ static void damos_update_stat(struct damos *s, static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos_filter *filter, - unsigned long min_sz_region) + unsigned long min_region_sz) { bool matched = false; struct damon_target *ti; @@ -1755,8 +1784,8 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, matched = target_idx == filter->target_idx; break; case DAMOS_FILTER_TYPE_ADDR: - start = ALIGN_DOWN(filter->addr_range.start, min_sz_region); - end = ALIGN_DOWN(filter->addr_range.end, min_sz_region); + start = ALIGN_DOWN(filter->addr_range.start, min_region_sz); + end = ALIGN_DOWN(filter->addr_range.end, min_region_sz); /* inside the range */ if (start <= r->ar.start && r->ar.end <= end) { @@ -1785,14 +1814,14 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, return matched == filter->matching; } -static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, +static bool damos_core_filter_out(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *s) { struct damos_filter *filter; 
s->core_filters_allowed = false; damos_for_each_core_filter(filter, s) { - if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) { + if (damos_filter_match(ctx, t, r, filter, ctx->min_region_sz)) { if (filter->allow) s->core_filters_allowed = true; return !filter->allow; @@ -1927,12 +1956,12 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - c->min_sz_region); + c->min_region_sz); if (!sz) goto update_stat; damon_split_region_at(t, r, sz); } - if (damos_filter_out(c, t, r, s)) + if (damos_core_filter_out(c, t, r, s)) return; ktime_get_coarse_ts64(&begin); trace_damos_before_apply(cidx, sidx, tidx, r, @@ -1975,13 +2004,18 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - if (damos_skip_charged_region(t, &r, s, c->min_sz_region)) + if (damos_skip_charged_region(t, &r, s, c->min_region_sz)) continue; - if (!damos_valid_target(c, t, r, s)) + if (s->max_nr_snapshots && + s->max_nr_snapshots <= s->stat.nr_snapshots) continue; - damos_apply_scheme(c, t, r, s); + if (damos_valid_target(c, t, r, s)) + damos_apply_scheme(c, t, r, s); + + if (damon_is_last_region(r, t)) + s->stat.nr_snapshots++; } } @@ -2078,16 +2112,13 @@ static unsigned long damos_get_node_memcg_used_bp( unsigned long used_pages, numerator; struct sysinfo i; - rcu_read_lock(); - memcg = mem_cgroup_from_id(goal->memcg_id); - if (!memcg || !mem_cgroup_tryget(memcg)) { - rcu_read_unlock(); + memcg = mem_cgroup_get_from_id(goal->memcg_id); + if (!memcg) { if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) return 0; else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ return 10000; } - rcu_read_unlock(); mem_cgroup_flush_stats(memcg); lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(goal->nid)); @@ -2119,6 +2150,23 @@ static unsigned long damos_get_node_memcg_used_bp( } #endif +/* + * 
Returns LRU-active or inactive memory to total LRU memory size ratio. + */ +static unsigned int damos_get_in_active_mem_bp(bool active_ratio) +{ + unsigned long active, inactive, total; + + /* This should align with /proc/meminfo output */ + active = global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_ANON) + + global_node_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE); + inactive = global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_ANON) + + global_node_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE); + total = active + inactive; + if (active_ratio) + return active * 10000 / total; + return inactive * 10000 / total; +} static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) { @@ -2141,6 +2189,11 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal) case DAMOS_QUOTA_NODE_MEMCG_FREE_BP: goal->current_value = damos_get_node_memcg_used_bp(goal); break; + case DAMOS_QUOTA_ACTIVE_MEM_BP: + case DAMOS_QUOTA_INACTIVE_MEM_BP: + goal->current_value = damos_get_in_active_mem_bp( + goal->metric == DAMOS_QUOTA_ACTIVE_MEM_BP); + break; default: break; } @@ -2273,6 +2326,22 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) quota->min_score = score; } +static void damos_trace_stat(struct damon_ctx *c, struct damos *s) +{ + unsigned int cidx = 0, sidx = 0; + struct damos *siter; + + if (!trace_damos_stat_after_apply_interval_enabled()) + return; + + damon_for_each_scheme(siter, c) { + if (siter == s) + break; + sidx++; + } + trace_damos_stat_after_apply_interval(cidx, sidx, &s->stat); +} + static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; @@ -2299,6 +2368,9 @@ static void kdamond_apply_schemes(struct damon_ctx *c) mutex_lock(&c->walk_control_lock); damon_for_each_target(t, c) { + if (c->ops.target_valid && c->ops.target_valid(t) == false) + continue; + damon_for_each_region_safe(r, next_r, t) damon_do_apply_schemes(c, t, r); } @@ -2311,6 +2383,7 @@ static void kdamond_apply_schemes(struct 
damon_ctx *c) (s->apply_interval_us ? s->apply_interval_us : c->attrs.aggr_interval) / sample_interval; s->last_applied = NULL; + damos_trace_stat(c, s); } mutex_unlock(&c->walk_control_lock); } @@ -2424,7 +2497,7 @@ static void damon_split_region_at(struct damon_target *t, /* Split every region in the given target into 'nr_subs' regions */ static void damon_split_regions_of(struct damon_target *t, int nr_subs, - unsigned long min_sz_region) + unsigned long min_region_sz) { struct damon_region *r, *next; unsigned long sz_region, sz_sub = 0; @@ -2434,13 +2507,13 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs, sz_region = damon_sz_region(r); for (i = 0; i < nr_subs - 1 && - sz_region > 2 * min_sz_region; i++) { + sz_region > 2 * min_region_sz; i++) { /* * Randomly select size of left sub-region to be at * least 10 percent and at most 90% of original region */ sz_sub = ALIGN_DOWN(damon_rand(1, 10) * - sz_region / 10, min_sz_region); + sz_region / 10, min_region_sz); /* Do not allow blank region */ if (sz_sub == 0 || sz_sub >= sz_region) continue; @@ -2480,7 +2553,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) nr_subregions = 3; damon_for_each_target(t, ctx) - damon_split_regions_of(t, nr_subregions, ctx->min_sz_region); + damon_split_regions_of(t, nr_subregions, ctx->min_region_sz); last_nr_regions = nr_regions; } @@ -2577,41 +2650,30 @@ static void kdamond_usleep(unsigned long usecs) */ static void kdamond_call(struct damon_ctx *ctx, bool cancel) { - struct damon_call_control *control; - LIST_HEAD(repeat_controls); - int ret = 0; + struct damon_call_control *control, *next; + LIST_HEAD(controls); - while (true) { - mutex_lock(&ctx->call_controls_lock); - control = list_first_entry_or_null(&ctx->call_controls, - struct damon_call_control, list); - mutex_unlock(&ctx->call_controls_lock); - if (!control) - break; - if (cancel) { + mutex_lock(&ctx->call_controls_lock); + list_splice_tail_init(&ctx->call_controls, &controls); + 
mutex_unlock(&ctx->call_controls_lock); + + list_for_each_entry_safe(control, next, &controls, list) { + if (!control->repeat || cancel) + list_del(&control->list); + + if (cancel) control->canceled = true; - } else { - ret = control->fn(control->data); - control->return_code = ret; - } - mutex_lock(&ctx->call_controls_lock); - list_del(&control->list); - mutex_unlock(&ctx->call_controls_lock); - if (!control->repeat) { + else + control->return_code = control->fn(control->data); + + if (!control->repeat) complete(&control->completion); - } else if (control->canceled && control->dealloc_on_cancel) { + else if (control->canceled && control->dealloc_on_cancel) kfree(control); - continue; - } else { - list_add(&control->list, &repeat_controls); - } } - control = list_first_entry_or_null(&repeat_controls, - struct damon_call_control, list); - if (!control || cancel) - return; + mutex_lock(&ctx->call_controls_lock); - list_add_tail(&control->list, &ctx->call_controls); + list_splice_tail(&controls, &ctx->call_controls); mutex_unlock(&ctx->call_controls_lock); } @@ -2670,8 +2732,6 @@ static void kdamond_init_ctx(struct damon_ctx *ctx) static int kdamond_fn(void *data) { struct damon_ctx *ctx = data; - struct damon_target *t; - struct damon_region *r, *next; unsigned int max_nr_accesses = 0; unsigned long sz_limit = 0; @@ -2747,7 +2807,7 @@ static int kdamond_fn(void *data) * * Reset ->next_aggregation_sis to avoid that. * It will anyway correctly updated after this - * if caluse. + * if clause. 
*/ ctx->next_aggregation_sis = next_aggregation_sis; @@ -2776,47 +2836,29 @@ static int kdamond_fn(void *data) } } done: - damon_for_each_target(t, ctx) { - damon_for_each_region_safe(r, next, t) - damon_destroy_region(r, t); - } + damon_destroy_targets(ctx); - if (ctx->ops.cleanup) - ctx->ops.cleanup(ctx); kfree(ctx->regions_score_histogram); kdamond_call(ctx, true); + damos_walk_cancel(ctx); pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); ctx->kdamond = NULL; mutex_unlock(&ctx->kdamond_lock); - damos_walk_cancel(ctx); - mutex_lock(&damon_lock); nr_running_ctxs--; if (!nr_running_ctxs && running_exclusive_ctxs) running_exclusive_ctxs = false; mutex_unlock(&damon_lock); - damon_destroy_targets(ctx); return 0; } -/* - * struct damon_system_ram_region - System RAM resource address region of - * [@start, @end). - * @start: Start address of the region (inclusive). - * @end: End address of the region (exclusive). - */ -struct damon_system_ram_region { - unsigned long start; - unsigned long end; -}; - static int walk_system_ram(struct resource *res, void *arg) { - struct damon_system_ram_region *a = arg; + struct damon_addr_range *a = arg; if (a->end - a->start < resource_size(res)) { a->start = res->start; @@ -2833,7 +2875,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) { - struct damon_system_ram_region arg = {}; + struct damon_addr_range arg = {}; walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); if (arg.end <= arg.start) @@ -2850,7 +2892,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, * @t: The monitoring target to set the region. * @start: The pointer to the start address of the region. * @end: The pointer to the end address of the region. - * @min_sz_region: Minimum region size. + * @min_region_sz: Minimum region size. * * This function sets the region of @t as requested by @start and @end. 
If the * values of @start and @end are zero, however, this function finds the biggest @@ -2862,7 +2904,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start, */ int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end, - unsigned long min_sz_region) + unsigned long min_region_sz) { struct damon_addr_range addr_range; @@ -2875,7 +2917,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1, min_sz_region); + return damon_set_regions(t, &addr_range, 1, min_region_sz); } /* diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 49b4bc294f4e..7bc5c0b2aea3 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -34,7 +34,7 @@ static bool enabled __read_mostly; * * Input parameters that updated while DAMON_LRU_SORT is running are not * applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT - * reads values of parametrs except ``enabled`` again. Once the re-reading is + * reads values of parameters except ``enabled`` again. Once the re-reading is * done, this parameter is set as ``N``. If invalid parameters are found while * the re-reading, DAMON_LRU_SORT will be disabled. */ @@ -42,6 +42,49 @@ static bool commit_inputs __read_mostly; module_param(commit_inputs, bool, 0600); /* + * Desired active to [in]active memory ratio in bp (1/10,000). + * + * While keeping the caps that are set by other quotas, DAMON_LRU_SORT + * automatically increases and decreases the effective level of the quota, + * aiming for the LRU [de]prioritizations of the hot and cold memory to result in + * this active to [in]active memory ratio. Value zero means disabling this + * auto-tuning feature. + * + * Disabled by default. + */ +static unsigned long active_mem_bp __read_mostly; +module_param(active_mem_bp, ulong, 0600); + +/* + * Auto-tune monitoring intervals.
+ * + * If this parameter is set as ``Y``, DAMON_LRU_SORT automatically tunes + * DAMON's sampling and aggregation intervals. The auto-tuning aims to capture + * a meaningful amount of access events in each DAMON-snapshot, while keeping the + * sampling interval at least 5 milliseconds and at most 10 seconds. + * Setting this as ``N`` disables the auto-tuning. + * + * Disabled by default. + */ +static bool autotune_monitoring_intervals __read_mostly; +module_param(autotune_monitoring_intervals, bool, 0600); + +/* + * Filter [non-]young pages accordingly for LRU [de]prioritizations. + * + * If this is set, check page level access (youngness) once again before each + * LRU [de]prioritization operation. LRU prioritization operation is skipped + * if the page has not been accessed since the last check (not young). LRU + * deprioritization operation is skipped if the page has been accessed since the + * last check (young). The feature is enabled or disabled if this parameter is + * set as ``Y`` or ``N``, respectively. + * + * Disabled by default. + */ +static bool filter_young_pages __read_mostly; +module_param(filter_young_pages, bool, 0600); + +/* * Access frequency threshold for hot memory regions identification in permil. * * If a memory region is accessed in frequency of this or higher, @@ -71,7 +114,7 @@ static struct damos_quota damon_lru_sort_quota = { /* Within the quota, mark hotter regions accessed first.
*/ .weight_sz = 0, .weight_nr_accesses = 1, - .weight_age = 0, + .weight_age = 1, }; DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); @@ -193,10 +236,53 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } +static int damon_lru_sort_add_quota_goals(struct damos *hot_scheme, + struct damos *cold_scheme) +{ + struct damos_quota_goal *goal; + + if (!active_mem_bp) + return 0; + goal = damos_new_quota_goal(DAMOS_QUOTA_ACTIVE_MEM_BP, active_mem_bp); + if (!goal) + return -ENOMEM; + damos_add_quota_goal(&hot_scheme->quota, goal); + /* aim 0.02 % goal conflict, to keep little ping pong */ + goal = damos_new_quota_goal(DAMOS_QUOTA_INACTIVE_MEM_BP, + 10000 - active_mem_bp + 2); + if (!goal) + return -ENOMEM; + damos_add_quota_goal(&cold_scheme->quota, goal); + return 0; +} + +static int damon_lru_sort_add_filters(struct damos *hot_scheme, + struct damos *cold_scheme) +{ + struct damos_filter *filter; + + if (!filter_young_pages) + return 0; + + /* disallow prioritizing not-young pages */ + filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, false, false); + if (!filter) + return -ENOMEM; + damos_add_filter(hot_scheme, filter); + + /* disallow de-prioritizing young pages */ + filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true, false); + if (!filter) + return -ENOMEM; + damos_add_filter(cold_scheme, filter); + return 0; +} + static int damon_lru_sort_apply_parameters(void) { struct damon_ctx *param_ctx; struct damon_target *param_target; + struct damon_attrs attrs; struct damos *hot_scheme, *cold_scheme; unsigned int hot_thres, cold_thres; int err; @@ -212,25 +298,34 @@ static int damon_lru_sort_apply_parameters(void) if (!monitor_region_start && !monitor_region_end) addr_unit = 1; param_ctx->addr_unit = addr_unit; - param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1); + param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1); if
(!damon_lru_sort_mon_attrs.sample_interval) { err = -EINVAL; goto out; } - err = damon_set_attrs(param_ctx, &damon_lru_sort_mon_attrs); + attrs = damon_lru_sort_mon_attrs; + if (autotune_monitoring_intervals) { + attrs.sample_interval = 5000; + attrs.aggr_interval = 100000; + attrs.intervals_goal.access_bp = 40; + attrs.intervals_goal.aggrs = 3; + attrs.intervals_goal.min_sample_us = 5000; + attrs.intervals_goal.max_sample_us = 10 * 1000 * 1000; + } + err = damon_set_attrs(param_ctx, &attrs); if (err) goto out; err = -ENOMEM; - hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) * + hot_thres = damon_max_nr_accesses(&attrs) * hot_thres_access_freq / 1000; hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!hot_scheme) goto out; - cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; + cold_thres = cold_min_age / attrs.aggr_interval; cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres); if (!cold_scheme) { damon_destroy_scheme(hot_scheme); @@ -240,10 +335,17 @@ static int damon_lru_sort_apply_parameters(void) damon_set_schemes(param_ctx, &hot_scheme, 1); damon_add_scheme(param_ctx, cold_scheme); + err = damon_lru_sort_add_quota_goals(hot_scheme, cold_scheme); + if (err) + goto out; + err = damon_lru_sort_add_filters(hot_scheme, cold_scheme); + if (err) + goto out; + err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, &monitor_region_end, - param_ctx->min_sz_region); + param_ctx->min_region_sz); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); @@ -303,7 +405,9 @@ static int damon_lru_sort_turn(bool on) err = damon_start(&ctx, 1, true); if (err) return err; - kdamond_pid = ctx->kdamond->pid; + kdamond_pid = damon_kdamond_pid(ctx); + if (kdamond_pid < 0) + return kdamond_pid; return damon_call(ctx, &call_control); } diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 07a8aead439e..9bfe48826840 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -156,7 +156,7 @@ static unsigned long 
damon_pa_pageout(struct damon_region *r, LIST_HEAD(folio_list); bool install_young_filter = true; struct damos_filter *filter; - struct folio *folio; + struct folio *folio = NULL; /* check access in page level again by default */ damos_for_each_ops_filter(filter, s) { @@ -206,13 +206,13 @@ put_folio: return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } -static inline unsigned long damon_pa_mark_accessed_or_deactivate( +static inline unsigned long damon_pa_de_activate( struct damon_region *r, unsigned long addr_unit, - struct damos *s, bool mark_accessed, + struct damos *s, bool activate, unsigned long *sz_filter_passed) { phys_addr_t addr, applied = 0; - struct folio *folio; + struct folio *folio = NULL; addr = damon_pa_phys_addr(r->ar.start, addr_unit); while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { @@ -227,8 +227,8 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( else *sz_filter_passed += folio_size(folio) / addr_unit; - if (mark_accessed) - folio_mark_accessed(folio); + if (activate) + folio_activate(folio); else folio_deactivate(folio); applied += folio_nr_pages(folio); @@ -240,20 +240,18 @@ put_folio: return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } -static unsigned long damon_pa_mark_accessed(struct damon_region *r, +static unsigned long damon_pa_activate_pages(struct damon_region *r, unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, true, - sz_filter_passed); + return damon_pa_de_activate(r, addr_unit, s, true, sz_filter_passed); } static unsigned long damon_pa_deactivate_pages(struct damon_region *r, unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, false, - sz_filter_passed); + return damon_pa_de_activate(r, addr_unit, s, false, sz_filter_passed); } static unsigned long damon_pa_migrate(struct damon_region *r, @@ -262,7 +260,7 @@ 
static unsigned long damon_pa_migrate(struct damon_region *r, { phys_addr_t addr, applied; LIST_HEAD(folio_list); - struct folio *folio; + struct folio *folio = NULL; addr = damon_pa_phys_addr(r->ar.start, addr_unit); while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { @@ -295,7 +293,7 @@ static unsigned long damon_pa_stat(struct damon_region *r, unsigned long *sz_filter_passed) { phys_addr_t addr; - struct folio *folio; + struct folio *folio = NULL; if (!damos_ops_has_filter(s)) return 0; @@ -327,7 +325,7 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, case DAMOS_PAGEOUT: return damon_pa_pageout(r, aunit, scheme, sz_filter_passed); case DAMOS_LRU_PRIO: - return damon_pa_mark_accessed(r, aunit, scheme, + return damon_pa_activate_pages(r, aunit, scheme, sz_filter_passed); case DAMOS_LRU_DEPRIO: return damon_pa_deactivate_pages(r, aunit, scheme, @@ -375,7 +373,6 @@ static int __init damon_pa_initcall(void) .prepare_access_checks = damon_pa_prepare_access_checks, .check_accesses = damon_pa_check_accesses, .target_valid = NULL, - .cleanup = NULL, .apply_scheme = damon_pa_apply_scheme, .get_scheme_score = damon_pa_scheme_score, }; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 36a582e09eae..43d76f5bed44 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -34,7 +34,7 @@ static bool enabled __read_mostly; * * Input parameters that updated while DAMON_RECLAIM is running are not applied * by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values - * of parametrs except ``enabled`` again. Once the re-reading is done, this + * of parameters except ``enabled`` again. Once the re-reading is done, this * parameter is set as ``N``. If invalid parameters are found while the * re-reading, DAMON_RECLAIM will be disabled. 
*/ @@ -208,7 +208,7 @@ static int damon_reclaim_apply_parameters(void) if (!monitor_region_start && !monitor_region_end) addr_unit = 1; param_ctx->addr_unit = addr_unit; - param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1); + param_ctx->min_region_sz = max(DAMON_MIN_REGION_SZ / addr_unit, 1); if (!damon_reclaim_mon_attrs.aggr_interval) { err = -EINVAL; @@ -251,7 +251,7 @@ static int damon_reclaim_apply_parameters(void) err = damon_set_region_biggest_system_ram_default(param_target, &monitor_region_start, &monitor_region_end, - param_ctx->min_sz_region); + param_ctx->min_region_sz); if (err) goto out; err = damon_commit_ctx(ctx, param_ctx); @@ -307,7 +307,9 @@ static int damon_reclaim_turn(bool on) err = damon_start(&ctx, 1, true); if (err) return err; - kdamond_pid = ctx->kdamond->pid; + kdamond_pid = damon_kdamond_pid(ctx); + if (kdamond_pid < 0) + return kdamond_pid; return damon_call(ctx, &call_control); } diff --git a/mm/damon/stat.c b/mm/damon/stat.c index ed8e3629d31a..bcf6c8ae9b90 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Shows data access monitoring resutls in simple metrics. + * Shows data access monitoring results in simple metrics. 
*/ #define pr_fmt(fmt) "damon-stat: " fmt @@ -34,7 +34,7 @@ module_param(estimated_memory_bandwidth, ulong, 0400); MODULE_PARM_DESC(estimated_memory_bandwidth, "Estimated memory bandwidth usage in bytes per second"); -static long memory_idle_ms_percentiles[101] __read_mostly = {0,}; +static long memory_idle_ms_percentiles[101] = {0,}; module_param_array(memory_idle_ms_percentiles, long, NULL, 0400); MODULE_PARM_DESC(memory_idle_ms_percentiles, "Memory idle time percentiles in milliseconds"); @@ -173,14 +173,6 @@ static struct damon_ctx *damon_stat_build_ctx(void) if (damon_set_attrs(ctx, &attrs)) goto free_out; - /* - * auto-tune sampling and aggregation interval aiming 4% DAMON-observed - * accesses ratio, keeping sampling interval in [5ms, 10s] range. - */ - ctx->attrs.intervals_goal = (struct damon_intervals_goal) { - .access_bp = 400, .aggrs = 3, - .min_sample_us = 5000, .max_sample_us = 10000000, - }; if (damon_select_ops(ctx, DAMON_OPS_PADDR)) goto free_out; @@ -189,7 +181,7 @@ static struct damon_ctx *damon_stat_build_ctx(void) goto free_out; damon_add_target(ctx, target); if (damon_set_region_biggest_system_ram_default(target, &start, &end, - ctx->min_sz_region)) + ctx->min_region_sz)) goto free_out; return ctx; free_out: diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 3a699dcd5a7f..2b05a6477188 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -204,6 +204,8 @@ struct damon_sysfs_stats { unsigned long sz_applied; unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; + unsigned long nr_snapshots; + unsigned long max_nr_snapshots; }; static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) @@ -265,6 +267,37 @@ static ssize_t qt_exceeds_show(struct kobject *kobj, return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); } +static ssize_t nr_snapshots_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, 
kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_snapshots); +} + +static ssize_t max_nr_snapshots_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->max_nr_snapshots); +} + +static ssize_t max_nr_snapshots_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + unsigned long max_nr_snapshots, err = kstrtoul(buf, 0, &max_nr_snapshots); + + if (err) + return err; + stats->max_nr_snapshots = max_nr_snapshots; + return count; +} + static void damon_sysfs_stats_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); @@ -288,6 +321,12 @@ static struct kobj_attribute damon_sysfs_stats_sz_ops_filter_passed_attr = static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = __ATTR_RO_MODE(qt_exceeds, 0400); +static struct kobj_attribute damon_sysfs_stats_nr_snapshots_attr = + __ATTR_RO_MODE(nr_snapshots, 0400); + +static struct kobj_attribute damon_sysfs_stats_max_nr_snapshots_attr = + __ATTR_RW_MODE(max_nr_snapshots, 0600); + static struct attribute *damon_sysfs_stats_attrs[] = { &damon_sysfs_stats_nr_tried_attr.attr, &damon_sysfs_stats_sz_tried_attr.attr, @@ -295,6 +334,8 @@ static struct attribute *damon_sysfs_stats_attrs[] = { &damon_sysfs_stats_sz_applied_attr.attr, &damon_sysfs_stats_sz_ops_filter_passed_attr.attr, &damon_sysfs_stats_qt_exceeds_attr.attr, + &damon_sysfs_stats_nr_snapshots_attr.attr, + &damon_sysfs_stats_max_nr_snapshots_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_stats); @@ -1038,6 +1079,14 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = { .metric = DAMOS_QUOTA_NODE_MEMCG_FREE_BP, .name = "node_memcg_free_bp", }, + { + .metric = DAMOS_QUOTA_ACTIVE_MEM_BP, + .name = "active_mem_bp", + }, + { + 
.metric = DAMOS_QUOTA_INACTIVE_MEM_BP, + .name = "inactive_mem_bp", + }, }; static ssize_t target_metric_show(struct kobject *kobj, @@ -2288,7 +2337,6 @@ static ssize_t target_nid_store(struct kobject *kobj, struct damon_sysfs_scheme, kobj); int err = 0; - /* TODO: error handling for target_nid range. */ err = kstrtoint(buf, 0, &scheme->target_nid); return err ? err : count; @@ -2454,7 +2502,7 @@ static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg, return false; } -static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) +static int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id) { struct mem_cgroup *memcg; char *path; @@ -2469,8 +2517,8 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id) for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg; memcg = mem_cgroup_iter(NULL, memcg, NULL)) { - /* skip removed memcg */ - if (!mem_cgroup_id(memcg)) + /* skip offlined memcg */ + if (!mem_cgroup_online(memcg)) continue; if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { *id = mem_cgroup_id(memcg); @@ -2719,6 +2767,7 @@ static struct damos *damon_sysfs_mk_scheme( damon_destroy_scheme(scheme); return NULL; } + scheme->max_nr_snapshots = sysfs_scheme->stats->max_nr_snapshots; return scheme; } @@ -2763,6 +2812,7 @@ void damon_sysfs_schemes_update_stats( sysfs_stats->sz_ops_filter_passed = scheme->stat.sz_ops_filter_passed; sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + sysfs_stats->nr_snapshots = scheme->stat.nr_snapshots; } } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 95fd9375a7d8..b7f66196bec4 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1365,7 +1365,7 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, static int damon_sysfs_set_regions(struct damon_target *t, struct damon_sysfs_regions *sysfs_regions, - unsigned long min_sz_region) + unsigned long min_region_sz) { struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr, sizeof(*ranges), GFP_KERNEL | 
__GFP_NOWARN); @@ -1387,7 +1387,7 @@ static int damon_sysfs_set_regions(struct damon_target *t, if (ranges[i - 1].end > ranges[i].start) goto out; } - err = damon_set_regions(t, ranges, sysfs_regions->nr, min_sz_region); + err = damon_set_regions(t, ranges, sysfs_regions->nr, min_region_sz); out: kfree(ranges); return err; @@ -1409,7 +1409,8 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, return -EINVAL; } t->obsolete = sys_target->obsolete; - return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region); + return damon_sysfs_set_regions(t, sys_target->regions, + ctx->min_region_sz); } static int damon_sysfs_add_targets(struct damon_ctx *ctx, @@ -1469,8 +1470,8 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, ctx->addr_unit = sys_ctx->addr_unit; /* addr_unit is respected by only DAMON_OPS_PADDR */ if (sys_ctx->ops_id == DAMON_OPS_PADDR) - ctx->min_sz_region = max( - DAMON_MIN_REGION / sys_ctx->addr_unit, 1); + ctx->min_region_sz = max( + DAMON_MIN_REGION_SZ / sys_ctx->addr_unit, 1); err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) return err; @@ -1819,10 +1820,9 @@ static ssize_t pid_show(struct kobject *kobj, if (!ctx) goto out; - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) - pid = ctx->kdamond->pid; - mutex_unlock(&ctx->kdamond_lock); + pid = damon_kdamond_pid(ctx); + if (pid < 0) + pid = -1; out: mutex_unlock(&damon_sysfs_lock); return sysfs_emit(buf, "%d\n", pid); diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 8cb369b63e08..92ea25e2dc9e 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -158,6 +158,7 @@ static void damon_test_split_at(struct kunit *test) r->nr_accesses_bp = 420000; r->nr_accesses = 42; r->last_nr_accesses = 15; + r->age = 10; damon_add_region(r, t); damon_split_region_at(t, r, 25); KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); @@ -170,6 +171,7 @@ static void damon_test_split_at(struct kunit *test) KUNIT_EXPECT_EQ(test, 
r->nr_accesses_bp, r_new->nr_accesses_bp); KUNIT_EXPECT_EQ(test, r->nr_accesses, r_new->nr_accesses); KUNIT_EXPECT_EQ(test, r->last_nr_accesses, r_new->last_nr_accesses); + KUNIT_EXPECT_EQ(test, r->age, r_new->age); damon_free_target(t); } @@ -190,6 +192,7 @@ static void damon_test_merge_two(struct kunit *test) } r->nr_accesses = 10; r->nr_accesses_bp = 100000; + r->age = 9; damon_add_region(r, t); r2 = damon_new_region(100, 300); if (!r2) { @@ -198,12 +201,15 @@ static void damon_test_merge_two(struct kunit *test) } r2->nr_accesses = 20; r2->nr_accesses_bp = 200000; + r2->age = 21; damon_add_region(r2, t); damon_merge_two_regions(t, r, r2); KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); KUNIT_EXPECT_EQ(test, r->ar.end, 300ul); KUNIT_EXPECT_EQ(test, r->nr_accesses, 16u); + KUNIT_EXPECT_EQ(test, r->nr_accesses_bp, 160000u); + KUNIT_EXPECT_EQ(test, r->age, 17u); i = 0; damon_for_each_region(r3, t) { @@ -232,12 +238,12 @@ static void damon_test_merge_regions_of(struct kunit *test) { struct damon_target *t; struct damon_region *r; - unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184}; - unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230}; - unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2}; + unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184, 230}; + unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230, 10170}; + unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2, 5}; - unsigned long saddrs[] = {0, 114, 130, 156, 170}; - unsigned long eaddrs[] = {112, 130, 156, 170, 230}; + unsigned long saddrs[] = {0, 114, 130, 156, 170, 230}; + unsigned long eaddrs[] = {112, 130, 156, 170, 230, 10170}; int i; t = damon_new_target(); @@ -255,9 +261,9 @@ static void damon_test_merge_regions_of(struct kunit *test) } damon_merge_regions_of(t, 9, 9999); - /* 0-112, 114-130, 130-156, 156-170 */ - KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); - for (i = 0; i < 5; i++) { + /* 0-112, 114-130, 130-156, 156-170, 170-230, 230-10170 */ + KUNIT_EXPECT_EQ(test, 
damon_nr_regions(t), 6u); + for (i = 0; i < 6; i++) { r = __nth_region_of(t, i); KUNIT_EXPECT_EQ(test, r->ar.start, saddrs[i]); KUNIT_EXPECT_EQ(test, r->ar.end, eaddrs[i]); @@ -269,6 +275,9 @@ static void damon_test_split_regions_of(struct kunit *test) { struct damon_target *t; struct damon_region *r; + unsigned long sa[] = {0, 300, 500}; + unsigned long ea[] = {220, 400, 700}; + int i; t = damon_new_target(); if (!t) @@ -295,6 +304,23 @@ static void damon_test_split_regions_of(struct kunit *test) damon_split_regions_of(t, 4, 1); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); + + t = damon_new_target(); + if (!t) + kunit_skip(test, "third target alloc fail"); + for (i = 0; i < ARRAY_SIZE(sa); i++) { + r = damon_new_region(sa[i], ea[i]); + if (!r) { + damon_free_target(t); + kunit_skip(test, "region alloc fail"); + } + damon_add_region(r, t); + } + damon_split_regions_of(t, 4, 5); + KUNIT_EXPECT_LE(test, damon_nr_regions(t), 12u); + damon_for_each_region(r, t) + KUNIT_EXPECT_GE(test, damon_sz_region(r) % 5ul, 0ul); + damon_free_target(t); } static void damon_test_ops_registration(struct kunit *test) @@ -574,9 +600,10 @@ static void damos_test_commit_quota_goal(struct kunit *test) }); damos_test_commit_quota_goal_for(test, &dst, &(struct damos_quota_goal) { - .metric = DAMOS_QUOTA_USER_INPUT, - .target_value = 789, - .current_value = 12, + .metric = DAMOS_QUOTA_SOME_MEM_PSI_US, + .target_value = 234, + .current_value = 345, + .last_psi_total = 567, }); } @@ -1159,7 +1186,7 @@ static void damon_test_set_filters_default_reject(struct kunit *test) damos_set_filters_default_reject(&scheme); /* * A core-handled allow-filter is installed. - * Rejct by default on core layer filtering stage due to the last + * Reject by default on core layer filtering stage due to the last * core-layer-filter's behavior. * Allow by default on ops layer filtering stage due to the absence of * ops layer filters. 
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index 30dc5459f1d2..cfae870178bf 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -147,7 +147,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, damon_add_region(r, t); } - damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION_SZ); for (i = 0; i < nr_expected / 2; i++) { r = __nth_region_of(t, i); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 23ed738a0bd6..83ab3d8c3792 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -19,8 +19,8 @@ #include "ops-common.h" #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST -#undef DAMON_MIN_REGION -#define DAMON_MIN_REGION 1 +#undef DAMON_MIN_REGION_SZ +#define DAMON_MIN_REGION_SZ 1 #endif /* @@ -78,7 +78,7 @@ static int damon_va_evenly_split_region(struct damon_target *t, orig_end = r->ar.end; sz_orig = damon_sz_region(r); - sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); + sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION_SZ); if (!sz_piece) return -EINVAL; @@ -161,12 +161,12 @@ next: swap(first_gap, second_gap); /* Store the result */ - regions[0].start = ALIGN(start, DAMON_MIN_REGION); - regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION); - regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); - regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); - regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); - regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION); + regions[0].start = ALIGN(start, DAMON_MIN_REGION_SZ); + regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION_SZ); + regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION_SZ); + regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION_SZ); + regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION_SZ); + regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION_SZ); return 0; } @@ -259,8 +259,8 @@ static void 
__damon_va_init_regions(struct damon_ctx *ctx, sz += regions[i].end - regions[i].start; if (ctx->attrs.min_nr_regions) sz /= ctx->attrs.min_nr_regions; - if (sz < DAMON_MIN_REGION) - sz = DAMON_MIN_REGION; + if (sz < DAMON_MIN_REGION_SZ) + sz = DAMON_MIN_REGION_SZ; /* Set the initial three regions of the target */ for (i = 0; i < 3; i++) { @@ -299,7 +299,7 @@ static void damon_va_update(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { if (damon_va_three_regions(t, three_regions)) continue; - damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION_SZ); } } @@ -1014,7 +1014,6 @@ static int __init damon_va_initcall(void) .check_accesses = damon_va_check_accesses, .target_valid = damon_va_target_valid, .cleanup_target = damon_va_cleanup_target, - .cleanup = NULL, .apply_scheme = damon_va_apply_scheme, .get_scheme_score = damon_va_scheme_score, }; diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index ae9b9310d96f..83cf07269f13 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -971,22 +971,26 @@ static unsigned long __init get_random_vaddr(void) return random_vaddr; } -static void __init destroy_args(struct pgtable_debug_args *args) +static void __init +debug_vm_pgtable_free_huge_page(struct pgtable_debug_args *args, + unsigned long pfn, int order) { - struct page *page = NULL; +#ifdef CONFIG_CONTIG_ALLOC + if (args->is_contiguous_page) { + free_contig_range(pfn, 1 << order); + return; + } +#endif + __free_pages(pfn_to_page(pfn), order); +} +static void __init destroy_args(struct pgtable_debug_args *args) +{ /* Free (huge) page */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_pud_hugepage() && args->pud_pfn != ULONG_MAX) { - if (args->is_contiguous_page) { - free_contig_range(args->pud_pfn, - (1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT))); - } else { - page = pfn_to_page(args->pud_pfn); - __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT); - } - + 
debug_vm_pgtable_free_huge_page(args, args->pud_pfn, HPAGE_PUD_ORDER); args->pud_pfn = ULONG_MAX; args->pmd_pfn = ULONG_MAX; args->pte_pfn = ULONG_MAX; @@ -995,20 +999,13 @@ static void __init destroy_args(struct pgtable_debug_args *args) if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage() && args->pmd_pfn != ULONG_MAX) { - if (args->is_contiguous_page) { - free_contig_range(args->pmd_pfn, (1 << HPAGE_PMD_ORDER)); - } else { - page = pfn_to_page(args->pmd_pfn); - __free_pages(page, HPAGE_PMD_ORDER); - } - + debug_vm_pgtable_free_huge_page(args, args->pmd_pfn, HPAGE_PMD_ORDER); args->pmd_pfn = ULONG_MAX; args->pte_pfn = ULONG_MAX; } if (args->pte_pfn != ULONG_MAX) { - page = pfn_to_page(args->pte_pfn); - __free_page(page); + __free_page(pfn_to_page(args->pte_pfn)); args->pte_pfn = ULONG_MAX; } @@ -1242,8 +1239,7 @@ static int __init init_args(struct pgtable_debug_args *args) */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_pud_hugepage()) { - page = debug_vm_pgtable_alloc_huge_page(args, - HPAGE_PUD_SHIFT - PAGE_SHIFT); + page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PUD_ORDER); if (page) { args->pud_pfn = page_to_pfn(page); args->pmd_pfn = args->pud_pfn; diff --git a/mm/dmapool_test.c b/mm/dmapool_test.c index 54b1fd1ccfbb..e8172d708308 100644 --- a/mm/dmapool_test.c +++ b/mm/dmapool_test.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/device.h> #include <linux/dma-map-ops.h> #include <linux/dma-mapping.h> diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index ff35b84a7b50..96c29b9dc85d 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -30,6 +30,14 @@ static int __init early_ioremap_debug_setup(char *str) } early_param("early_ioremap_debug", early_ioremap_debug_setup); +#define early_ioremap_dbg(fmt, args...) 
\ + do { \ + if (unlikely(early_ioremap_debug)) { \ + pr_warn(fmt, ##args); \ + dump_stack(); \ + } \ + } while (0) + static int after_paging_init __initdata; pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr, @@ -139,6 +147,9 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) if (WARN_ON(nrpages > NR_FIX_BTMAPS)) return NULL; + early_ioremap_dbg("%s(%pa, %08lx) [%d] => %08lx + %08lx\n", + __func__, &phys_addr, size, slot, slot_virt[slot], offset); + /* * Ok, go for it.. */ @@ -152,8 +163,6 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) --idx; --nrpages; } - WARN(early_ioremap_debug, "%s(%pa, %08lx) [%d] => %08lx + %08lx\n", - __func__, &phys_addr, size, slot, offset, slot_virt[slot]); prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); return prev_map[slot]; @@ -184,8 +193,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) __func__, addr, size, slot, prev_size[slot])) return; - WARN(early_ioremap_debug, "%s(%p, %08lx) [%d]\n", - __func__, addr, size, slot); + early_ioremap_dbg("%s(%p, %08lx) [%d]\n", __func__, addr, size, slot); virt_addr = (unsigned long)addr; if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 45540942d148..a02179a0bded 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Compatibility functions which bloat the callers too much to make inline. 
* All of the callers of these functions should be converted to use folios @@ -2806,17 +2806,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) return !reject_file_backed || shmem_mapping(mapping); } -static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start, - unsigned int flags, struct page **pages) -{ - while ((*nr) - nr_start) { - struct folio *folio = page_folio(pages[--(*nr)]); - - folio_clear_referenced(folio); - gup_put_folio(folio, 1, flags); - } -} - #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* * GUP-fast relies on pte change detection to avoid concurrent pgtable diff --git a/mm/gup_test.c b/mm/gup_test.c index eeb3f4d87c51..9dd48db897b9 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/mm.h> #include <linux/slab.h> diff --git a/mm/highmem.c b/mm/highmem.c index b5c8e4c2d5d4..a33e41183951 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -180,12 +180,13 @@ struct page *__kmap_to_page(void *vaddr) for (i = 0; i < kctrl->idx; i++) { unsigned long base_addr; int idx; + pte_t pteval = kctrl->pteval[i]; idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); if (base_addr == base) - return pte_page(kctrl->pteval[i]); + return pte_page(pteval); } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a6d37902b73d..0d487649e4de 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3431,7 +3431,7 @@ static void remap_page(struct folio *folio, unsigned long nr, int flags) if (!folio_test_anon(folio)) return; for (;;) { - remove_migration_ptes(folio, folio, RMP_LOCKED | flags); + remove_migration_ptes(folio, folio, TTU_RMAP_LOCKED | flags); i += folio_nr_pages(folio); if (i >= nr) break; @@ -3944,7 +3944,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, int old_order = folio_order(folio); struct folio *new_folio, *next; int nr_shmem_dropped = 0; - int remap_flags = 
0; + enum ttu_flags ttu_flags = 0; int ret; pgoff_t end = 0; @@ -4064,9 +4064,9 @@ fail: shmem_uncharge(mapping->host, nr_shmem_dropped); if (!ret && is_anon && !folio_is_device_private(folio)) - remap_flags = RMP_USE_SHARED_ZEROPAGE; + ttu_flags = TTU_USE_SHARED_ZEROPAGE; - remap_page(folio, 1 << old_order, remap_flags); + remap_page(folio, 1 << old_order, ttu_flags); /* * Unlock all after-split folios except the one containing diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a1832da0f623..0b005e944ee3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -121,16 +121,6 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); -static void hugetlb_free_folio(struct folio *folio) -{ - if (folio_test_hugetlb_cma(folio)) { - hugetlb_cma_free_folio(folio); - return; - } - - folio_put(folio); -} - static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) @@ -588,8 +578,9 @@ hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from, record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); list_add(&nrg->link, rg); coalesce_file_region(map, nrg); - } else + } else { *regions_needed += 1; + } return to - from; } @@ -1257,8 +1248,9 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma) if (vma_lock && vma_lock->vma != vma) vma->vm_private_data = NULL; - } else + } else { vma->vm_private_data = NULL; + } } /* @@ -1417,47 +1409,25 @@ err: return NULL; } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -#ifdef CONFIG_CONTIG_ALLOC -static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, +#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && defined(CONFIG_CONTIG_ALLOC) +static struct folio *alloc_gigantic_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { struct folio *folio; - bool retried = false; - -retry: - folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask); - if (!folio) { - 
if (hugetlb_cma_exclusive_alloc()) - return NULL; - - folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask); - if (!folio) - return NULL; - } - if (folio_ref_freeze(folio, 1)) + folio = hugetlb_cma_alloc_frozen_folio(order, gfp_mask, nid, nodemask); + if (folio) return folio; - pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio)); - hugetlb_free_folio(folio); - if (!retried) { - retried = true; - goto retry; - } - return NULL; -} + if (hugetlb_cma_exclusive_alloc()) + return NULL; -#else /* !CONFIG_CONTIG_ALLOC */ -static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, - nodemask_t *nodemask) -{ - return NULL; + folio = (struct folio *)alloc_contig_frozen_pages(1 << order, gfp_mask, + nid, nodemask); + return folio; } -#endif /* CONFIG_CONTIG_ALLOC */ - -#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, +#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE || !CONFIG_CONTIG_ALLOC */ +static struct folio *alloc_gigantic_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; @@ -1587,9 +1557,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, if (unlikely(folio_test_hwpoison(folio))) folio_clear_hugetlb_hwpoison(folio); - folio_ref_unfreeze(folio, 1); - - hugetlb_free_folio(folio); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); + if (folio_test_hugetlb_cma(folio)) + hugetlb_cma_free_frozen_folio(folio); + else + free_frozen_pages(&folio->page, folio_order(folio)); } /* @@ -1869,7 +1841,7 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) return NULL; } -static struct folio *alloc_buddy_hugetlb_folio(int order, gfp_t gfp_mask, +static struct folio *alloc_buddy_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; @@ -1925,10 +1897,10 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, nid = 
numa_mem_id(); if (order_is_gigantic(order)) - folio = alloc_gigantic_folio(order, gfp_mask, nid, nmask); + folio = alloc_gigantic_frozen_folio(order, gfp_mask, nid, nmask); else - folio = alloc_buddy_hugetlb_folio(order, gfp_mask, nid, nmask, - node_alloc_noretry); + folio = alloc_buddy_frozen_folio(order, gfp_mask, nid, nmask, + node_alloc_noretry); if (folio) init_new_hugetlb_folio(folio); return folio; @@ -2106,8 +2078,9 @@ retry: h->max_huge_pages++; goto out; } - } else + } else { rc = 0; + } update_and_free_hugetlb_folio(h, folio, false); return rc; @@ -2702,11 +2675,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, * be consumed on a subsequent allocation. */ folio_set_hugetlb_restore_reserve(folio); - } else + } else { /* * No reservation present, do nothing */ - vma_end_reservation(h, vma, address); + vma_end_reservation(h, vma, address); + } } } @@ -2836,23 +2810,62 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) */ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) { - struct folio *folio; + unsigned long nr = 0; + struct page *page; + struct hstate *h; + LIST_HEAD(list); int ret = 0; - LIST_HEAD(isolate_list); + /* Avoid pfn iterations if no free non-gigantic huge pages */ + for_each_hstate(h) { + if (hstate_is_gigantic(h)) + continue; + + nr += h->free_huge_pages; + if (nr) + break; + } + + if (!nr) + return 0; while (start_pfn < end_pfn) { - folio = pfn_folio(start_pfn); + page = pfn_to_page(start_pfn); + nr = 1; - /* Not to disrupt normal path by vainly holding hugetlb_lock */ - if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) { - ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list); - if (ret) - break; + if (PageHuge(page) || PageCompound(page)) { + struct folio *folio = page_folio(page); + + nr = folio_nr_pages(folio) - folio_page_idx(folio, page); + + /* + * Don't disrupt normal path by vainly holding + * hugetlb_lock + */ + if 
(folio_test_hugetlb(folio) && !folio_ref_count(folio)) { + if (order_is_gigantic(folio_order(folio))) { + ret = -ENOMEM; + break; + } + + ret = alloc_and_dissolve_hugetlb_folio(folio, &list); + if (ret) + break; + + putback_movable_pages(&list); + } + } else if (PageBuddy(page)) { + /* + * Buddy order check without zone lock is unsafe and + * the order is maybe invalid, but race should be + * small, and the worst thing is skipping free hugetlb. + */ + const unsigned int order = buddy_order_unsafe(page); - putback_movable_pages(&isolate_list); + if (order <= MAX_PAGE_ORDER) + nr = 1UL << order; } - start_pfn++; + start_pfn += nr; } return ret; @@ -3019,13 +3032,10 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); - if (map_chg) { - spin_lock_irq(&hugetlb_lock); - hugetlb_cgroup_uncharge_folio_rsvd( - hstate_index(h), pages_per_huge_page(h), - folio); - spin_unlock_irq(&hugetlb_lock); - } + spin_lock_irq(&hugetlb_lock); + hugetlb_cgroup_uncharge_folio_rsvd( + hstate_index(h), pages_per_huge_page(h), folio); + spin_unlock_irq(&hugetlb_lock); } } @@ -3425,6 +3435,13 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); + if (!folio && !list_empty(&folio_list) && + hugetlb_vmemmap_optimizable_size(h)) { + prep_and_add_allocated_folios(h, &folio_list); + INIT_LIST_HEAD(&folio_list); + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, + &node_states[N_MEMORY], NULL); + } if (!folio) break; list_add(&folio->lru, &folio_list); @@ -4159,7 +4176,6 @@ static int __init hugetlb_init(void) } } - hugetlb_cma_check(); hugetlb_init_hstates(); gather_bootmem_prealloc(); report_hugepages(); @@ -4487,21 +4503,11 @@ void __init hugetlb_bootmem_set_nodes(void) } } -static bool __hugetlb_bootmem_allocated __initdata; - -bool __init hugetlb_bootmem_allocated(void) 
-{ - return __hugetlb_bootmem_allocated; -} - void __init hugetlb_bootmem_alloc(void) { struct hstate *h; int i; - if (__hugetlb_bootmem_allocated) - return; - hugetlb_bootmem_set_nodes(); for (i = 0; i < MAX_NUMNODES; i++) @@ -4515,8 +4521,6 @@ void __init hugetlb_bootmem_alloc(void) if (hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } - - __hugetlb_bootmem_allocated = true; } /* @@ -4718,10 +4722,12 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) if (vma_lock->vma != vma) { vma->vm_private_data = NULL; hugetlb_vma_lock_alloc(vma); - } else + } else { pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); - } else + } + } else { hugetlb_vma_lock_alloc(vma); + } } } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 58e895f3899a..792d06538fa9 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * * Copyright IBM Corporation, 2012 @@ -7,14 +8,6 @@ * Copyright (C) 2019 Red Hat, Inc. * Author: Giuseppe Scrivano <gscrivan@redhat.com> * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- * */ #include <linux/cgroup.h> @@ -822,7 +815,7 @@ hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft, for (i = 0; i < tmpl_size; cft++, tmpl++, i++) { *cft = *tmpl; /* rebuild the name */ - snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); + scnprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); /* rebuild the private */ cft->private = MEMFILE_PRIVATE(idx, tmpl->private); /* rebuild the file_offset */ diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index e8e4dc7182d5..f83ae4998990 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -13,42 +13,46 @@ #include "hugetlb_cma.h" -static struct cma *hugetlb_cma[MAX_NUMNODES]; +static struct cma *hugetlb_cma[MAX_NUMNODES] __ro_after_init; static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; -static bool hugetlb_cma_only; -static unsigned long hugetlb_cma_size __initdata; +static bool hugetlb_cma_only __ro_after_init; +static unsigned long hugetlb_cma_size __ro_after_init; -void hugetlb_cma_free_folio(struct folio *folio) +void hugetlb_cma_free_frozen_folio(struct folio *folio) { - int nid = folio_nid(folio); - - WARN_ON_ONCE(!cma_free_folio(hugetlb_cma[nid], folio)); + WARN_ON_ONCE(!cma_release_frozen(hugetlb_cma[folio_nid(folio)], + &folio->page, folio_nr_pages(folio))); } - -struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +struct folio *hugetlb_cma_alloc_frozen_folio(int order, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { int node; - struct folio *folio = NULL; + struct folio *folio; + struct page *page = NULL; + + if (!hugetlb_cma_size) + return NULL; if (hugetlb_cma[nid]) - folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask); + page = cma_alloc_frozen_compound(hugetlb_cma[nid], order); - if (!folio && !(gfp_mask & __GFP_THISNODE)) { + if (!page && !(gfp_mask & __GFP_THISNODE)) { for_each_node_mask(node, *nodemask) { if (node == nid || !hugetlb_cma[node]) continue; - folio = 
cma_alloc_folio(hugetlb_cma[node], order, gfp_mask); - if (folio) + page = cma_alloc_frozen_compound(hugetlb_cma[node], order); + if (page) break; } } - if (folio) - folio_set_hugetlb_cma(folio); + if (!page) + return NULL; + folio = page_folio(page); + folio_set_hugetlb_cma(folio); return folio; } @@ -85,9 +89,6 @@ hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact) return m; } - -static bool cma_reserve_called __initdata; - static int __init cmdline_parse_hugetlb_cma(char *p) { int nid, count = 0; @@ -134,12 +135,26 @@ static int __init cmdline_parse_hugetlb_cma_only(char *p) early_param("hugetlb_cma_only", cmdline_parse_hugetlb_cma_only); -void __init hugetlb_cma_reserve(int order) +unsigned int __weak arch_hugetlb_cma_order(void) { - unsigned long size, reserved, per_node; + return 0; +} + +void __init hugetlb_cma_reserve(void) +{ + unsigned long size, reserved, per_node, order; bool node_specific_cma_alloc = false; int nid; + if (!hugetlb_cma_size) + return; + + order = arch_hugetlb_cma_order(); + if (!order) { + pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); + return; + } + /* * HugeTLB CMA reservation is required for gigantic * huge pages which could not be allocated via the @@ -147,10 +162,6 @@ void __init hugetlb_cma_reserve(int order) * breaking this assumption. 
*/ VM_WARN_ON(order <= MAX_PAGE_ORDER); - cma_reserve_called = true; - - if (!hugetlb_cma_size) - return; hugetlb_bootmem_set_nodes(); @@ -244,14 +255,6 @@ void __init hugetlb_cma_reserve(int order) hugetlb_cma_size = 0; } -void __init hugetlb_cma_check(void) -{ - if (!hugetlb_cma_size || cma_reserve_called) - return; - - pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); -} - bool hugetlb_cma_exclusive_alloc(void) { return hugetlb_cma_only; diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h index 2c2ec8a7e134..c619c394b1ae 100644 --- a/mm/hugetlb_cma.h +++ b/mm/hugetlb_cma.h @@ -3,23 +3,22 @@ #define _LINUX_HUGETLB_CMA_H #ifdef CONFIG_CMA -void hugetlb_cma_free_folio(struct folio *folio); -struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, +void hugetlb_cma_free_frozen_folio(struct folio *folio); +struct folio *hugetlb_cma_alloc_frozen_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask); struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact); -void hugetlb_cma_check(void); bool hugetlb_cma_exclusive_alloc(void); unsigned long hugetlb_cma_total_size(void); void hugetlb_cma_validate_params(void); bool hugetlb_early_cma(struct hstate *h); #else -static inline void hugetlb_cma_free_folio(struct folio *folio) +static inline void hugetlb_cma_free_frozen_folio(struct folio *folio) { } -static inline struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +static inline struct folio *hugetlb_cma_alloc_frozen_folio(int order, + gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } @@ -31,10 +30,6 @@ struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, return NULL; } -static inline void hugetlb_cma_check(void) -{ -} - static inline bool hugetlb_cma_exclusive_alloc(void) { return false; diff --git a/mm/hugetlb_sysctl.c b/mm/hugetlb_sysctl.c index bd3077150542..e74cf18ad431 100644 --- a/mm/hugetlb_sysctl.c +++ 
b/mm/hugetlb_sysctl.c @@ -8,6 +8,8 @@ #include "hugetlb_internal.h" +int movable_gigantic_pages; + #ifdef CONFIG_SYSCTL static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *length, @@ -125,6 +127,15 @@ static const struct ctl_table hugetlb_table[] = { .mode = 0644, .proc_handler = hugetlb_overcommit_handler, }, +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION + { + .procname = "movable_gigantic_pages", + .data = &movable_gigantic_pages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif }; void __init hugetlb_sysctl_init(void) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 9d01f883fd71..a9280259e12a 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -794,14 +794,6 @@ void __init hugetlb_vmemmap_init_early(int nid) struct huge_bootmem_page *m = NULL; void *map; - /* - * Noting to do if bootmem pages were not allocated - * early in boot, or if HVO wasn't enabled in the - * first place. - */ - if (!hugetlb_bootmem_allocated()) - return; - if (!READ_ONCE(vmemmap_optimize_enabled)) return; @@ -847,9 +839,6 @@ void __init hugetlb_vmemmap_init_late(int nid) struct hstate *h; void *map; - if (!hugetlb_bootmem_allocated()) - return; - if (!READ_ONCE(vmemmap_optimize_enabled)) return; diff --git a/mm/internal.h b/mm/internal.h index aacda4f79534..aee1f72ef6ed 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -171,7 +171,7 @@ static inline int mmap_file(struct file *file, struct vm_area_struct *vma) /* * OK, we tried to call the file hook for mmap(), but an error - * arose. The mapping is in an inconsistent state and we most not invoke + * arose. The mapping is in an inconsistent state and we must not invoke * any further hooks on it. 
*/ vma->vm_ops = &vma_dummy_vm_ops; @@ -199,6 +199,73 @@ static inline void vma_close(struct vm_area_struct *vma) #ifdef CONFIG_MMU +static inline void get_anon_vma(struct anon_vma *anon_vma) +{ + atomic_inc(&anon_vma->refcount); +} + +void __put_anon_vma(struct anon_vma *anon_vma); + +static inline void put_anon_vma(struct anon_vma *anon_vma) +{ + if (atomic_dec_and_test(&anon_vma->refcount)) + __put_anon_vma(anon_vma); +} + +static inline void anon_vma_lock_write(struct anon_vma *anon_vma) +{ + down_write(&anon_vma->root->rwsem); +} + +static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) +{ + return down_write_trylock(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) +{ + up_write(&anon_vma->root->rwsem); +} + +static inline void anon_vma_lock_read(struct anon_vma *anon_vma) +{ + down_read(&anon_vma->root->rwsem); +} + +static inline int anon_vma_trylock_read(struct anon_vma *anon_vma) +{ + return down_read_trylock(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) +{ + up_read(&anon_vma->root->rwsem); +} + +struct anon_vma *folio_get_anon_vma(const struct folio *folio); + +/* Operations which modify VMAs. */ +enum vma_operation { + VMA_OP_SPLIT, + VMA_OP_MERGE_UNFAULTED, + VMA_OP_REMAP, + VMA_OP_FORK, +}; + +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation); +int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma); +int __anon_vma_prepare(struct vm_area_struct *vma); +void unlink_anon_vmas(struct vm_area_struct *vma); + +static inline int anon_vma_prepare(struct vm_area_struct *vma) +{ + if (likely(vma->anon_vma)) + return 0; + + return __anon_vma_prepare(vma); +} + /* Flags for folio_pte_batch(). 
*/ typedef int __bitwise fpb_t; @@ -513,6 +580,14 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } +static inline void set_pages_refcounted(struct page *page, unsigned long nr_pages) +{ + unsigned long pfn = page_to_pfn(page); + + for (; nr_pages--; pfn++) + set_page_refcounted(pfn_to_page(pfn)); +} + /* * Return true if a folio needs ->release_folio() calling upon it. */ @@ -853,6 +928,12 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int, bool); +#ifdef CONFIG_SPARSEMEM +void sparse_init(void); +#else +static inline void sparse_init(void) {} +#endif /* CONFIG_SPARSEMEM */ + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -929,9 +1010,14 @@ void init_cma_reserved_pageblock(struct page *page); struct cma; #ifdef CONFIG_CMA +bool cma_validate_zones(struct cma *cma); void *cma_reserve_early(struct cma *cma, unsigned long size); void init_cma_pageblock(struct page *page); #else +static inline bool cma_validate_zones(struct cma *cma) +{ + return false; +} static inline void *cma_reserve_early(struct cma *cma, unsigned long size) { return NULL; @@ -1658,24 +1744,6 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); -/* pt_reclaim.c */ -bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); -void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, - pmd_t pmdval); -void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, - struct mmu_gather *tlb); - -#ifdef CONFIG_PT_RECLAIM -bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, - struct zap_details *details); -#else -static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, - struct zap_details *details) -{ - return false; -} -#endif /* CONFIG_PT_RECLAIM */ - void dup_mm_exe_file(struct 
mm_struct *mm, struct mm_struct *oldmm); int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 2cafca31b092..b4d157962121 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1840,6 +1840,29 @@ static void vmalloc_helpers_tags(struct kunit *test) vfree(ptr); } +static void vmalloc_oob_helper(struct kunit *test, char *v_ptr, size_t size) +{ + /* + * We have to be careful not to hit the guard page in vmalloc tests. + * The MMU will catch that and crash us. + */ + + /* Make sure in-bounds accesses are valid. */ + v_ptr[0] = 0; + v_ptr[size - 1] = 0; + + /* + * An unaligned access past the requested vmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); + + /* An aligned access into the first out-of-bounds granule. */ + size = round_up(size, KASAN_GRANULE_SIZE); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)v_ptr)[size]); +} + static void vmalloc_oob(struct kunit *test) { char *v_ptr, *p_ptr; @@ -1856,24 +1879,21 @@ static void vmalloc_oob(struct kunit *test) OPTIMIZER_HIDE_VAR(v_ptr); - /* - * We have to be careful not to hit the guard page in vmalloc tests. - * The MMU will catch that and crash us. - */ + vmalloc_oob_helper(test, v_ptr, size); - /* Make sure in-bounds accesses are valid. */ - v_ptr[0] = 0; - v_ptr[size - 1] = 0; + size -= KASAN_GRANULE_SIZE + 1; + v_ptr = vrealloc(v_ptr, size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); - /* - * An unaligned access past the requested vmalloc size. - * Only generic KASAN can precisely detect these. - */ - if (IS_ENABLED(CONFIG_KASAN_GENERIC)) - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); + OPTIMIZER_HIDE_VAR(v_ptr); - /* An aligned access into the first out-of-bounds granule. 
*/ - KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)v_ptr)[size + 5]); + vmalloc_oob_helper(test, v_ptr, size); + + size += 2 * KASAN_GRANULE_SIZE + 2; + v_ptr = vrealloc(v_ptr, size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + vmalloc_oob_helper(test, v_ptr, size); /* Check that in-bounds accesses to the physical page are valid. */ page = vmalloc_to_page(v_ptr); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 62c01b4527eb..27efb78eb32d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -203,7 +203,7 @@ static inline void fail_non_kasan_kunit_test(void) { } static DEFINE_RAW_SPINLOCK(report_lock); -static void start_report(unsigned long *flags, bool sync) +static void start_report(unsigned long *flags) { fail_non_kasan_kunit_test(); /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */ @@ -543,7 +543,7 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty if (unlikely(!report_enabled())) return; - start_report(&flags, true); + start_report(&flags); __memset(&info, 0, sizeof(info)); info.type = type; @@ -581,7 +581,7 @@ bool kasan_report(const void *addr, size_t size, bool is_write, goto out; } - start_report(&irq_flags, true); + start_report(&irq_flags); __memset(&info, 0, sizeof(info)); info.type = KASAN_REPORT_ACCESS; @@ -615,7 +615,7 @@ void kasan_report_async(void) if (unlikely(!report_enabled())) return; - start_report(&flags, false); + start_report(&flags); pr_err("BUG: KASAN: invalid-access\n"); pr_err("Asynchronous fault: no details available\n"); pr_err("\n"); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 32fbdf759ea2..d286e0a04543 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -305,7 +305,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int index; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); index = PFN_DOWN(addr - data->start); page = data->pages[index]; @@ -319,7 +319,7 @@ static int 
kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, } spin_unlock(&init_mm.page_table_lock); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); return 0; } @@ -471,7 +471,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int none; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_pause(); spin_lock(&init_mm.page_table_lock); pte = ptep_get(ptep); @@ -483,7 +483,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, if (likely(!none)) __free_page(pfn_to_page(pte_pfn(pte))); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_resume(); return 0; } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 653e162fa494..b4ea3262c925 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -905,7 +905,7 @@ static void toggle_allocation_gate(struct work_struct *work) /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); #endif - queue_delayed_work(system_unbound_wq, &kfence_timer, + queue_delayed_work(system_dfl_wq, &kfence_timer, msecs_to_jiffies(kfence_sample_interval)); } @@ -955,7 +955,7 @@ static void kfence_init_enable(void) #endif WRITE_ONCE(kfence_enabled, true); - queue_delayed_work(system_unbound_wq, &kfence_timer, 0); + queue_delayed_work(system_dfl_wq, &kfence_timer, 0); pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, @@ -1051,7 +1051,7 @@ static int kfence_enable_late(void) return kfence_init_late(); WRITE_ONCE(kfence_enabled, true); - queue_delayed_work(system_unbound_wq, &kfence_timer, 0); + queue_delayed_work(system_dfl_wq, &kfence_timer, 0); pr_info("re-enabled\n"); return 0; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 97d1b2824386..1b8faae5b448 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -22,6 +22,7 @@ #include <linux/dax.h> #include <linux/ksm.h> #include <linux/pgalloc.h> +#include <linux/backing-dev.h> #include <asm/tlb.h> #include "internal.h" 
@@ -58,6 +59,7 @@ enum scan_result { SCAN_STORE_FAILED, SCAN_COPY_MC, SCAN_PAGE_FILLED, + SCAN_PAGE_DIRTY_OR_WRITEBACK, }; #define CREATE_TRACE_POINTS @@ -535,17 +537,16 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, } } -static int __collapse_huge_page_isolate(struct vm_area_struct *vma, - unsigned long start_addr, - pte_t *pte, - struct collapse_control *cc, - struct list_head *compound_pagelist) +static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long start_addr, pte_t *pte, struct collapse_control *cc, + struct list_head *compound_pagelist) { struct page *page = NULL; struct folio *folio = NULL; unsigned long addr = start_addr; pte_t *_pte; - int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; + int none_or_zero = 0, shared = 0, referenced = 0; + enum scan_result result = SCAN_FAIL; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, addr += PAGE_SIZE) { @@ -778,13 +779,13 @@ static void __collapse_huge_page_copy_failed(pte_t *pte, * @ptl: lock on raw pages' PTEs * @compound_pagelist: list that stores compound pages */ -static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, +static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, unsigned long address, spinlock_t *ptl, struct list_head *compound_pagelist) { unsigned int i; - int result = SCAN_SUCCEED; + enum scan_result result = SCAN_SUCCEED; /* * Copying pages' contents is subject to memory poison at any iteration. @@ -826,7 +827,7 @@ static void khugepaged_alloc_sleep(void) remove_wait_queue(&khugepaged_wait, &wait); } -struct collapse_control khugepaged_collapse_control = { +static struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, }; @@ -896,10 +897,8 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc) * Returns enum scan_result value. 
*/ -static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, - bool expect_anon, - struct vm_area_struct **vmap, - struct collapse_control *cc) +static enum scan_result hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) { struct vm_area_struct *vma; enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : @@ -928,7 +927,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } -static inline int check_pmd_state(pmd_t *pmd) +static inline enum scan_result check_pmd_state(pmd_t *pmd) { pmd_t pmde = pmdp_get_lockless(pmd); @@ -951,9 +950,8 @@ static inline int check_pmd_state(pmd_t *pmd) return SCAN_SUCCEED; } -static int find_pmd_or_thp_or_none(struct mm_struct *mm, - unsigned long address, - pmd_t **pmd) +static enum scan_result find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, pmd_t **pmd) { *pmd = mm_find_pmd(mm, address); if (!*pmd) @@ -962,12 +960,11 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, return check_pmd_state(*pmd); } -static int check_pmd_still_valid(struct mm_struct *mm, - unsigned long address, - pmd_t *pmd) +static enum scan_result check_pmd_still_valid(struct mm_struct *mm, + unsigned long address, pmd_t *pmd) { pmd_t *new_pmd; - int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); + enum scan_result result = find_pmd_or_thp_or_none(mm, address, &new_pmd); if (result != SCAN_SUCCEED) return result; @@ -983,15 +980,14 @@ static int check_pmd_still_valid(struct mm_struct *mm, * Called and returns without pte mapped or spinlocks held. * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. 
*/ -static int __collapse_huge_page_swapin(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long start_addr, pmd_t *pmd, - int referenced) +static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd, + int referenced) { int swapped_in = 0; vm_fault_t ret = 0; unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); - int result; + enum scan_result result; pte_t *pte = NULL; spinlock_t *ptl; @@ -1060,8 +1056,8 @@ out: return result; } -static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, - struct collapse_control *cc) +static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, + struct collapse_control *cc) { gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE); @@ -1088,9 +1084,8 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, return SCAN_SUCCEED; } -static int collapse_huge_page(struct mm_struct *mm, unsigned long address, - int referenced, int unmapped, - struct collapse_control *cc) +static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address, + int referenced, int unmapped, struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; @@ -1098,7 +1093,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, pgtable_t pgtable; struct folio *folio; spinlock_t *pmd_ptl, *pte_ptl; - int result = SCAN_FAIL; + enum scan_result result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; @@ -1244,15 +1239,14 @@ out_nolock: return result; } -static int hpage_collapse_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long start_addr, bool *mmap_locked, - struct collapse_control *cc) +static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long start_addr, bool *mmap_locked, + struct 
collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; - int result = SCAN_FAIL, referenced = 0; - int none_or_zero = 0, shared = 0; + int none_or_zero = 0, shared = 0, referenced = 0; + enum scan_result result = SCAN_FAIL; struct page *page = NULL; struct folio *folio = NULL; unsigned long addr; @@ -1439,8 +1433,8 @@ static void collect_mm_slot(struct mm_slot *slot) } /* folio must be locked, and mmap_lock must be held */ -static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmdp, struct folio *folio, struct page *page) +static enum scan_result set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmdp, struct folio *folio, struct page *page) { struct mm_struct *mm = vma->vm_mm; struct vm_fault vmf = { @@ -1475,22 +1469,11 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, return SCAN_SUCCEED; } -/** - * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at - * address haddr. - * - * @mm: process address space where collapse happens - * @addr: THP collapse address - * @install_pmd: If a huge PMD should be installed - * - * This function checks whether all the PTEs in the PMD are pointing to the - * right THP. If so, retract the page table so the THP can refault in with - * as pmd-mapped. Possibly install a huge PMD mapping the THP. - */ -int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, - bool install_pmd) +static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) { - int nr_mapped_ptes = 0, result = SCAN_FAIL; + enum scan_result result = SCAN_FAIL; + int nr_mapped_ptes = 0; unsigned int nr_batch_ptes; struct mmu_notifier_range range; bool notified = false; @@ -1709,6 +1692,24 @@ drop_folio: return result; } +/** + * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at + * address haddr. 
+ * + * @mm: process address space where collapse happens + * @addr: THP collapse address + * @install_pmd: If a huge PMD should be installed + * + * This function checks whether all the PTEs in the PMD are pointing to the + * right THP. If so, retract the page table so the THP can refault in with + * as pmd-mapped. Possibly install a huge PMD mapping the THP. + */ +void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) +{ + try_collapse_pte_mapped_thp(mm, addr, install_pmd); +} + /* Can we retract page tables for this file-backed VMA? */ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) { @@ -1854,9 +1855,8 @@ drop_pml: * + unlock old pages * + unlock and free huge page; */ -static int collapse_file(struct mm_struct *mm, unsigned long addr, - struct file *file, pgoff_t start, - struct collapse_control *cc) +static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; struct page *dst; @@ -1864,7 +1864,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); - int nr_none = 0, result = SCAN_SUCCEED; + enum scan_result result = SCAN_SUCCEED; + int nr_none = 0; bool is_shmem = shmem_file(file); VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); @@ -1967,11 +1968,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, */ xas_unlock_irq(&xas); filemap_flush(mapping); - result = SCAN_FAIL; + result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto xa_unlocked; } else if (folio_test_writeback(folio)) { xas_unlock_irq(&xas); - result = SCAN_FAIL; + result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto xa_unlocked; } else if (folio_trylock(folio)) { folio_get(folio); @@ -2018,7 +2019,7 @@ static int collapse_file(struct mm_struct 
*mm, unsigned long addr, * folio is dirty because it hasn't been flushed * since first write. */ - result = SCAN_FAIL; + result = SCAN_PAGE_DIRTY_OR_WRITEBACK; goto out_unlock; } @@ -2194,16 +2195,13 @@ immap_locked: xas_lock_irq(&xas); } - if (is_shmem) + if (is_shmem) { + lruvec_stat_mod_folio(new_folio, NR_SHMEM, HPAGE_PMD_NR); lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); - else + } else { lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); - - if (nr_none) { - lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); - /* nr_none is always 0 for non-shmem. */ - lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); } + lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, HPAGE_PMD_NR); /* * Mark new_folio as uptodate before inserting it into the @@ -2225,7 +2223,7 @@ immap_locked: /* * Remove pte page tables, so we can re-fault the page as huge. - * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). + * If MADV_COLLAPSE, adjust result to call try_collapse_pte_mapped_thp(). 
*/ retract_page_tables(mapping, start); if (cc && !cc->is_khugepaged) @@ -2237,6 +2235,11 @@ immap_locked: */ list_for_each_entry_safe(folio, tmp, &pagelist, lru) { list_del(&folio->lru); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, + -folio_nr_pages(folio)); + if (is_shmem) + lruvec_stat_mod_folio(folio, NR_SHMEM, + -folio_nr_pages(folio)); folio->mapping = NULL; folio_clear_active(folio); folio_clear_unevictable(folio); @@ -2285,16 +2288,15 @@ out: return result; } -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, - struct file *file, pgoff_t start, - struct collapse_control *cc) +static enum scan_result hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, struct collapse_control *cc) { struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; - int result = SCAN_SUCCEED; + enum scan_result result = SCAN_SUCCEED; present = 0; swap = 0; @@ -2392,7 +2394,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, return result; } -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, enum scan_result *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) @@ -2440,14 +2442,15 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, break; } if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { -skip: progress++; continue; } hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); - if (khugepaged_scan.address > hend) - goto skip; + if (khugepaged_scan.address > hend) { + progress++; + continue; + } if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); @@ -2476,7 +2479,7 @@ 
skip: mmap_read_lock(mm); if (hpage_collapse_test_exit_or_disable(mm)) goto breakouterloop; - *result = collapse_pte_mapped_thp(mm, + *result = try_collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); if (*result == SCAN_PMD_MAPPED) *result = SCAN_SUCCEED; @@ -2552,7 +2555,7 @@ static void khugepaged_do_scan(struct collapse_control *cc) unsigned int progress = 0, pass_through_head = 0; unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; - int result = SCAN_SUCCEED; + enum scan_result result = SCAN_SUCCEED; lru_add_drain_all(); @@ -2747,6 +2750,7 @@ static int madvise_collapse_errno(enum scan_result r) case SCAN_PAGE_LRU: case SCAN_DEL_PAGE_LRU: case SCAN_PAGE_FILLED: + case SCAN_PAGE_DIRTY_OR_WRITEBACK: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to @@ -2764,7 +2768,8 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, struct collapse_control *cc; struct mm_struct *mm = vma->vm_mm; unsigned long hstart, hend, addr; - int thps = 0, last_fail = SCAN_FAIL; + enum scan_result last_fail = SCAN_FAIL; + int thps = 0; bool mmap_locked = true; BUG_ON(vma->vm_start > start); @@ -2785,8 +2790,10 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, hend = end & HPAGE_PMD_MASK; for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { - int result = SCAN_FAIL; + enum scan_result result = SCAN_FAIL; + bool triggered_wb = false; +retry: if (!mmap_locked) { cond_resched(); mmap_read_lock(mm); @@ -2807,8 +2814,20 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, mmap_read_unlock(mm); mmap_locked = false; + *lock_dropped = true; result = hpage_collapse_scan_file(mm, addr, file, pgoff, cc); + + if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb && + mapping_can_writeback(file->f_mapping)) { + loff_t lstart = (loff_t)pgoff << PAGE_SHIFT; + loff_t lend = lstart + HPAGE_PMD_SIZE - 1; + + filemap_write_and_wait_range(file->f_mapping, lstart, 
lend); + triggered_wb = true; + fput(file); + goto retry; + } fput(file); } else { result = hpage_collapse_scan_pmd(mm, vma, addr, @@ -2826,7 +2845,7 @@ handle_result: case SCAN_PTE_MAPPED_HUGEPAGE: BUG_ON(mmap_locked); mmap_read_lock(mm); - result = collapse_pte_mapped_thp(mm, addr, true); + result = try_collapse_pte_mapped_thp(mm, addr, true); mmap_read_unlock(mm); goto handle_result; /* Whitelisted set of results where continuing OK */ diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 902ec48b1e3e..81e642db6e23 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -361,7 +361,7 @@ static void test_init_vmalloc(struct kunit *test) KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } -/* Test case: ensure that use-after-free reporting works. */ +/* Test case: ensure that use-after-free reporting works for kmalloc. */ static void test_uaf(struct kunit *test) { EXPECTATION_USE_AFTER_FREE(expect); @@ -378,6 +378,65 @@ static void test_uaf(struct kunit *test) KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } +static void test_uninit_page(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + struct page *page; + int *ptr; + + kunit_info(test, "uninitialized page allocation (UMR report)\n"); + page = alloc_pages(GFP_KERNEL, 0); + ptr = page_address(page); + USE(*ptr); + __free_pages(page, 0); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static volatile char *test_uaf_pages_helper(int order, int offset) +{ + struct page *page; + volatile char *var; + + /* Memory is initialized up until __free_pages() thanks to __GFP_ZERO. */ + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); + var = page_address(page) + offset; + __free_pages(page, order); + + return var; +} + +/* Test case: ensure that use-after-free reporting works for a freed page. 
*/ +static void test_uaf_pages(struct kunit *test) +{ + EXPECTATION_USE_AFTER_FREE(expect); + volatile char value; + + kunit_info(test, "use-after-free on a freed page (UMR report)\n"); + /* Allocate a single page, free it, then try to access it. */ + value = *test_uaf_pages_helper(0, 3); + USE(value); + + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that UAF reporting works for high order pages. */ +static void test_uaf_high_order_pages(struct kunit *test) +{ + EXPECTATION_USE_AFTER_FREE(expect); + volatile char value; + + kunit_info(test, + "use-after-free on a freed high-order page (UMR report)\n"); + /* + * Create a high-order non-compound page, free it, then try to access + * its tail page. + */ + value = *test_uaf_pages_helper(1, PAGE_SIZE + 3); + USE(value); + + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + /* * Test case: ensure that uninitialized values are propagated through per-CPU * memory. @@ -682,7 +741,10 @@ static struct kunit_case kmsan_test_cases[] = { KUNIT_CASE(test_uninit_kmsan_check_memory), KUNIT_CASE(test_init_kmsan_vmap_vunmap), KUNIT_CASE(test_init_vmalloc), + KUNIT_CASE(test_uninit_page), KUNIT_CASE(test_uaf), + KUNIT_CASE(test_uaf_pages), + KUNIT_CASE(test_uaf_high_order_pages), KUNIT_CASE(test_percpu_propagate), KUNIT_CASE(test_printk), KUNIT_CASE(test_init_memcpy), diff --git a/mm/list_lru.c b/mm/list_lru.c index ec48b5dadf51..13b9f66d950e 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -187,7 +187,7 @@ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) if (list_lru_memcg_aware(lru)) { rcu_read_lock(); - ret = list_lru_add(lru, item, nid, mem_cgroup_from_slab_obj(item)); + ret = list_lru_add(lru, item, nid, mem_cgroup_from_virt(item)); rcu_read_unlock(); } else { ret = list_lru_add(lru, item, nid, NULL); @@ -224,7 +224,7 @@ bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) if (list_lru_memcg_aware(lru)) { rcu_read_lock(); - ret = list_lru_del(lru, item, nid, 
mem_cgroup_from_slab_obj(item)); + ret = list_lru_del(lru, item, nid, mem_cgroup_from_virt(item)); rcu_read_unlock(); } else { ret = list_lru_del(lru, item, nid, NULL); @@ -369,7 +369,7 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, xa_for_each(&lru->xa, index, mlru) { rcu_read_lock(); - memcg = mem_cgroup_from_id(index); + memcg = mem_cgroup_from_private_id(index); if (!mem_cgroup_tryget(memcg)) { rcu_read_unlock(); continue; diff --git a/mm/madvise.c b/mm/madvise.c index b617b1be0f53..1f3040688f04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -109,9 +109,7 @@ void anon_vma_name_free(struct kref *kref) struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) - vma_assert_locked(vma); - + vma_assert_stabilised(vma); return vma->anon_name; } @@ -453,7 +451,7 @@ restart: if (!start_pte) return 0; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { nr = 1; ptent = ptep_get(pte); @@ -461,7 +459,7 @@ restart: if (++batch_count == SWAP_CLUSTER_MAX) { batch_count = 0; if (need_resched()) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); cond_resched(); goto restart; @@ -497,7 +495,7 @@ restart: if (!folio_trylock(folio)) continue; folio_get(folio); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); @@ -508,7 +506,7 @@ restart: if (!start_pte) break; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (!err) nr = 0; continue; @@ -556,7 +554,7 @@ restart: } if (start_pte) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); } if (pageout) @@ -675,7 +673,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!start_pte) return 0; flush_tlb_batched_pending(mm); - 
arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { nr = 1; ptent = ptep_get(pte); @@ -694,7 +692,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; - free_swap_and_cache_nr(entry, nr); + swap_put_entries_direct(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (softleaf_is_hwpoison(entry) || softleaf_is_poison_marker(entry)) { @@ -724,7 +722,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!folio_trylock(folio)) continue; folio_get(folio); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); start_pte = NULL; err = split_folio(folio); @@ -735,7 +733,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!start_pte) break; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (!err) nr = 0; continue; @@ -775,7 +773,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); if (start_pte) { - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(start_pte, ptl); } cond_resched(); @@ -1867,7 +1865,7 @@ static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) * madvise_should_skip() - Return if the request is invalid or nothing. * @start: Start address of madvise-requested address range. * @len_in: Length of madvise-requested address range. - * @behavior: Requested madvise behavor. + * @behavior: Requested madvise behavior. * @err: Pointer to store an error code from the check. 
* * If the specified behaviour is invalid or nothing would occur, we skip the diff --git a/mm/memblock.c b/mm/memblock.c index 905d06b16348..e76255e4ff36 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -773,7 +773,7 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt unsigned long start_pfn, end_pfn, mem_size_mb; int nid, i; - /* calculate lose page */ + /* calculate lost page */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { if (!numa_valid_node(nid)) nr_pages += end_pfn - start_pfn; @@ -2414,7 +2414,7 @@ EXPORT_SYMBOL_GPL(reserve_mem_find_by_name); /** * reserve_mem_release_by_name - Release reserved memory region with a given name - * @name: The name that is attatched to a reserved memory region + * @name: The name that is attached to a reserved memory region * * Forcibly release the pages in the reserved memory region so that those memory * can be used as free memory. After released the reserved region size becomes 0. diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 6eed14bff742..0e3d972fad33 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -427,6 +427,28 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, } #endif +static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +{ + unsigned long val; + + if (mem_cgroup_is_root(memcg)) { + /* + * Approximate root's usage from global state. This isn't + * perfect, but the root usage was always an approximation. 
+ */ + val = global_node_page_state(NR_FILE_PAGES) + + global_node_page_state(NR_ANON_MAPPED); + if (swap) + val += total_swap_pages - get_nr_swap_pages(); + } else { + if (!swap) + val = page_counter_read(&memcg->memory); + else + val = page_counter_read(&memcg->memsw); + } + return val; +} + static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) { struct mem_cgroup_threshold_ary *t; @@ -613,14 +635,14 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) * have an ID allocated to it anymore, charge the closest online * ancestor for the swap instead and transfer the memory+swap charge. */ - swap_memcg = mem_cgroup_id_get_online(memcg); + swap_memcg = mem_cgroup_private_id_get_online(memcg); nr_entries = folio_nr_pages(folio); /* Get references for the tail pages, too */ if (nr_entries > 1) - mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); + mem_cgroup_private_id_get_many(swap_memcg, nr_entries - 1); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry); + swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), entry); folio_unqueue_deferred_split(folio); folio->memcg_data = 0; diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index a304ad418cdf..eb3c3c105657 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -22,15 +22,13 @@ iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); - void drain_all_stock(struct mem_cgroup *root_memcg); unsigned long memcg_events(struct mem_cgroup *memcg, int event); int memory_stat_show(struct seq_file *m, void *v); -void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n); -struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg); +void mem_cgroup_private_id_get_many(struct mem_cgroup *memcg, unsigned int n); +struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg); /* Cgroup v1-specific declarations */ 
#ifdef CONFIG_MEMCG_V1 diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36ab9897b61b..b730233a481d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -646,7 +646,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w) * in latency-sensitive paths is as cheap as possible. */ __mem_cgroup_flush_stats(root_mem_cgroup, true); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); + queue_delayed_work(system_dfl_wq, &stats_flush_dwork, FLUSH_TIME); } unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) @@ -816,7 +816,7 @@ void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) struct lruvec *lruvec; rcu_read_lock(); - memcg = mem_cgroup_from_slab_obj(p); + memcg = mem_cgroup_from_virt(p); /* * Untracked pages have no memcg, no lruvec. Update only the @@ -1639,11 +1639,6 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return max; } -unsigned long mem_cgroup_size(struct mem_cgroup *memcg) -{ - return page_counter_read(&memcg->memory); -} - void __memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event, bool allow_spinning) { @@ -2658,7 +2653,7 @@ struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p) * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), * cgroup_mutex, etc. */ -struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) +struct mem_cgroup *mem_cgroup_from_virt(void *p) { struct slab *slab; @@ -3320,28 +3315,6 @@ void folio_split_memcg_refs(struct folio *folio, unsigned old_order, css_get_many(&__folio_memcg(folio)->css, new_refs); } -unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) -{ - unsigned long val; - - if (mem_cgroup_is_root(memcg)) { - /* - * Approximate root's usage from global state. This isn't - * perfect, but the root usage was always an approximation. 
- */ - val = global_node_page_state(NR_FILE_PAGES) + - global_node_page_state(NR_ANON_MAPPED); - if (swap) - val += total_swap_pages - get_nr_swap_pages(); - } else { - if (!swap) - val = page_counter_read(&memcg->memory); - else - val = page_counter_read(&memcg->memsw); - } - return val; -} - static int memcg_online_kmem(struct mem_cgroup *memcg) { struct obj_cgroup *objcg; @@ -3629,38 +3602,38 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) */ #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) -static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids); +static DEFINE_XARRAY_ALLOC1(mem_cgroup_private_ids); -static void mem_cgroup_id_remove(struct mem_cgroup *memcg) +static void mem_cgroup_private_id_remove(struct mem_cgroup *memcg) { if (memcg->id.id > 0) { - xa_erase(&mem_cgroup_ids, memcg->id.id); + xa_erase(&mem_cgroup_private_ids, memcg->id.id); memcg->id.id = 0; } } -void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, +void __maybe_unused mem_cgroup_private_id_get_many(struct mem_cgroup *memcg, unsigned int n) { refcount_add(n, &memcg->id.ref); } -static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) +static void mem_cgroup_private_id_put_many(struct mem_cgroup *memcg, unsigned int n) { if (refcount_sub_and_test(n, &memcg->id.ref)) { - mem_cgroup_id_remove(memcg); + mem_cgroup_private_id_remove(memcg); /* Memcg ID pins CSS */ css_put(&memcg->css); } } -static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) +static inline void mem_cgroup_private_id_put(struct mem_cgroup *memcg) { - mem_cgroup_id_put_many(memcg, 1); + mem_cgroup_private_id_put_many(memcg, 1); } -struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg) { while (!refcount_inc_not_zero(&memcg->id.ref)) { /* @@ -3679,39 +3652,35 @@ struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) } /** - * mem_cgroup_from_id - look up a 
memcg from a memcg id + * mem_cgroup_from_private_id - look up a memcg from a memcg id * @id: the memcg id to look up * * Caller must hold rcu_read_lock(). */ -struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +struct mem_cgroup *mem_cgroup_from_private_id(unsigned short id) { WARN_ON_ONCE(!rcu_read_lock_held()); - return xa_load(&mem_cgroup_ids, id); + return xa_load(&mem_cgroup_private_ids, id); } -#ifdef CONFIG_SHRINKER_DEBUG -struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +struct mem_cgroup *mem_cgroup_get_from_id(u64 id) { struct cgroup *cgrp; struct cgroup_subsys_state *css; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; - cgrp = cgroup_get_from_id(ino); + cgrp = cgroup_get_from_id(id); if (IS_ERR(cgrp)) - return ERR_CAST(cgrp); + return NULL; css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); if (css) memcg = container_of(css, struct mem_cgroup, css); - else - memcg = ERR_PTR(-ENOENT); cgroup_put(cgrp); return memcg; } -#endif static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn) { @@ -3786,7 +3755,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) if (!memcg) return ERR_PTR(-ENOMEM); - error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL, + error = xa_alloc(&mem_cgroup_private_ids, &memcg->id.id, NULL, XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL); if (error) goto fail; @@ -3846,7 +3815,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) lru_gen_init_memcg(memcg); return memcg; fail: - mem_cgroup_id_remove(memcg); + mem_cgroup_private_id_remove(memcg); __mem_cgroup_free(memcg); return ERR_PTR(error); } @@ -3920,7 +3889,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) goto offline_kmem; if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled()) - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, + queue_delayed_work(system_dfl_wq, &stats_flush_dwork, FLUSH_TIME); lru_gen_online_memcg(memcg); @@ -3929,7 +3898,7 @@ 
static int mem_cgroup_css_online(struct cgroup_subsys_state *css) css_get(css); /* - * Ensure mem_cgroup_from_id() works once we're fully online. + * Ensure mem_cgroup_from_private_id() works once we're fully online. * * We could do this earlier and require callers to filter with * css_tryget_online(). But right now there are no users that @@ -3938,13 +3907,13 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) * publish it here at the end of onlining. This matches the * regular ID destruction during offlining. */ - xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL); + xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL); return 0; offline_kmem: memcg_offline_kmem(memcg); remove_id: - mem_cgroup_id_remove(memcg); + mem_cgroup_private_id_remove(memcg); return -ENOMEM; } @@ -3967,7 +3936,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) drain_all_stock(memcg); - mem_cgroup_id_put(memcg); + mem_cgroup_private_id_put(memcg); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -4854,7 +4823,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, id = lookup_swap_cgroup_id(entry); rcu_read_lock(); - memcg = mem_cgroup_from_id(id); + memcg = mem_cgroup_from_private_id(id); if (!memcg || !css_tryget_online(&memcg->css)) memcg = get_mem_cgroup_from_mm(mm); rcu_read_unlock(); @@ -5051,7 +5020,7 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) memcg = folio_memcg(old); /* * Note that it is normal to see !memcg for a hugetlb folio. - * For e.g, itt could have been allocated when memory_hugetlb_accounting + * For e.g, it could have been allocated when memory_hugetlb_accounting * was not selected. 
*/ VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old); @@ -5257,22 +5226,22 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) return 0; } - memcg = mem_cgroup_id_get_online(memcg); + memcg = mem_cgroup_private_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); - mem_cgroup_id_put(memcg); + mem_cgroup_private_id_put(memcg); return -ENOMEM; } /* Get references for the tail pages, too */ if (nr_pages > 1) - mem_cgroup_id_get_many(memcg, nr_pages - 1); + mem_cgroup_private_id_get_many(memcg, nr_pages - 1); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); - swap_cgroup_record(folio, mem_cgroup_id(memcg), entry); + swap_cgroup_record(folio, mem_cgroup_private_id(memcg), entry); return 0; } @@ -5289,7 +5258,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) id = swap_cgroup_clear(entry, nr_pages); rcu_read_lock(); - memcg = mem_cgroup_from_id(id); + memcg = mem_cgroup_from_private_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) { if (do_memsw_account()) @@ -5298,7 +5267,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) page_counter_uncharge(&memcg->swap, nr_pages); } mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); - mem_cgroup_id_put_many(memcg, nr_pages); + mem_cgroup_private_id_put_many(memcg, nr_pages); } rcu_read_unlock(); } diff --git a/mm/memfd.c b/mm/memfd.c index f032c6052926..82a3f38aa30a 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -1,10 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * memfd_create system call and file sealing support * * Code was originally included in shmem.c, and broken out to facilitate * use by hugetlbfs as well as tmpfs. - * - * This file is released under the GPL. 
*/ #include <linux/fs.h> diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9fd8355176eb..ba4231858a36 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -868,7 +868,7 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, * * MF_RECOVERED - The m-f() handler marks the page as PG_hwpoisoned'ed. * The page has been completely isolated, that is, unmapped, taken out of - * the buddy system, or hole-punnched out of the file mapping. + * the buddy system, or hole-punched out of the file mapping. */ static const char *action_name[] = { [MF_IGNORED] = "Ignored", diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 864811fff409..0ae8bec86346 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -475,8 +475,7 @@ static void establish_demotion_targets(void) */ list_for_each_entry_reverse(memtier, &memory_tiers, list) { tier_nodes = get_memtier_nodemask(memtier); - nodes_and(tier_nodes, node_states[N_CPU], tier_nodes); - if (!nodes_empty(tier_nodes)) { + if (nodes_and(tier_nodes, node_states[N_CPU], tier_nodes)) { /* * abstract distance below the max value of this memtier * is considered toptier. @@ -648,7 +647,7 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) if (node_memory_types[node].memtype == memtype || !memtype) node_memory_types[node].map_count--; /* - * If we umapped all the attached devices to this node, + * If we unmapped all the attached devices to this node, * clear the node memory type. 
*/ if (!node_memory_types[node].map_count) { @@ -956,7 +955,7 @@ static ssize_t demotion_enabled_store(struct kobject *kobj, struct pglist_data *pgdat; for_each_online_pgdat(pgdat) - atomic_set(&pgdat->kswapd_failures, 0); + kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_OTHER); } return count; diff --git a/mm/memory.c b/mm/memory.c index b2909c94e249..b0d487229b2e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -934,7 +934,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct page *page; if (likely(softleaf_is_swap(entry))) { - if (swap_duplicate(entry) < 0) + if (swap_dup_entry_direct(entry) < 0) return -EIO; /* make sure dst_mm is on swapoff's mmlist. */ @@ -1256,7 +1256,7 @@ again: spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { nr = 1; @@ -1325,7 +1325,7 @@ again: } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(orig_src_pte, src_ptl); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); @@ -1748,7 +1748,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, nr = swap_pte_batch(pte, max_nr, ptent); rss[MM_SWAPENTS] -= nr; - free_swap_and_cache_nr(entry, nr); + swap_put_entries_direct(entry, nr); } else if (softleaf_is_migration(entry)) { struct folio *folio = softleaf_to_folio(entry); @@ -1821,11 +1821,70 @@ static inline int do_zap_pte_range(struct mmu_gather *tlb, return nr; } +static bool pte_table_reclaim_possible(unsigned long start, unsigned long end, + struct zap_details *details) +{ + if (!IS_ENABLED(CONFIG_PT_RECLAIM)) + return false; + /* Only zap if we are allowed to and cover the full page table. 
*/ + return details && details->reclaim_pt && (end - start >= PMD_SIZE); +} + +static bool zap_empty_pte_table(struct mm_struct *mm, pmd_t *pmd, + spinlock_t *ptl, pmd_t *pmdval) +{ + spinlock_t *pml = pmd_lockptr(mm, pmd); + + if (ptl != pml && !spin_trylock(pml)) + return false; + + *pmdval = pmdp_get(pmd); + pmd_clear(pmd); + if (ptl != pml) + spin_unlock(pml); + return true; +} + +static bool zap_pte_table_if_empty(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, pmd_t *pmdval) +{ + spinlock_t *pml, *ptl = NULL; + pte_t *start_pte, *pte; + int i; + + pml = pmd_lock(mm, pmd); + start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, pmdval, &ptl); + if (!start_pte) + goto out_ptl; + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + + for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(ptep_get(pte))) + goto out_ptl; + } + pte_unmap(start_pte); + + pmd_clear(pmd); + + if (ptl != pml) + spin_unlock(ptl); + spin_unlock(pml); + return true; +out_ptl: + if (start_pte) + pte_unmap_unlock(start_pte, ptl); + if (ptl != pml) + spin_unlock(pml); + return false; +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { + bool can_reclaim_pt = pte_table_reclaim_possible(addr, end, details); bool force_flush = false, force_break = false; struct mm_struct *mm = tlb->mm; int rss[NR_MM_COUNTERS]; @@ -1834,7 +1893,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *pte; pmd_t pmdval; unsigned long start = addr; - bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); bool direct_reclaim = true; int nr; @@ -1846,7 +1904,7 @@ retry: return addr; flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { bool any_skipped = false; @@ -1875,10 +1933,10 @@ retry: * from being repopulated by another thread. 
*/ if (can_reclaim_pt && direct_reclaim && addr == end) - direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); + direct_reclaim = zap_empty_pte_table(mm, pmd, ptl, &pmdval); add_mm_rss_vec(mm, rss); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); /* Do the actual TLB flush before dropping ptl */ if (force_flush) { @@ -1904,10 +1962,10 @@ retry: } if (can_reclaim_pt) { - if (direct_reclaim) - free_pte(mm, start, tlb, pmdval); - else - try_to_free_pte(mm, pmd, start, tlb); + if (direct_reclaim || zap_pte_table_if_empty(mm, pmd, start, &pmdval)) { + pte_free_tlb(tlb, pmd_pgtable(pmdval), addr); + mm_dec_nr_ptes(mm); + } } return addr; @@ -2499,7 +2557,6 @@ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, { unsigned long count = vma_pages(vma); unsigned long uaddr = vma->vm_start; - int ret, i; /* Fail if the user requested offset is beyond the end of the object */ if (offset >= num) @@ -2509,14 +2566,7 @@ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, if (count > num - offset) return -ENXIO; - for (i = 0; i < count; i++) { - ret = vm_insert_page(vma, uaddr, pages[offset + i]); - if (ret < 0) - return ret; - uaddr += PAGE_SIZE; - } - - return 0; + return vm_insert_pages(vma, uaddr, pages + offset, &count); } /** @@ -2816,7 +2866,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { @@ -2826,7 +2876,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(mapped_pte, ptl); return err; } @@ -3177,7 +3227,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, return 
-EINVAL; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); if (fn) { do { @@ -3190,7 +3240,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, } *mask |= PGTBL_PTE_MODIFIED; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (mm != &init_mm) pte_unmap_unlock(mapped_pte, ptl); @@ -4357,12 +4407,27 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) return 0; } -static inline bool should_try_to_free_swap(struct folio *folio, +/* + * Check if we should call folio_free_swap to free the swap cache. + * folio_free_swap only frees the swap cache to release the slot if swap + * count is zero, so we don't need to check the swap count here. + */ +static inline bool should_try_to_free_swap(struct swap_info_struct *si, + struct folio *folio, struct vm_area_struct *vma, + unsigned int extra_refs, unsigned int fault_flags) { if (!folio_test_swapcache(folio)) return false; + /* + * Always try to free swap cache for SWP_SYNCHRONOUS_IO devices. Swap + * cache can help save some IO or memory overhead, but these devices + * are fast, and meanwhile, swap cache pinning the slot deferring the + * release of metadata or fragmentation is a more critical issue. + */ + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) + return true; if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) || folio_test_mlocked(folio)) return true; @@ -4373,7 +4438,7 @@ static inline bool should_try_to_free_swap(struct folio *folio, * reference only in case it's likely that we'll be the exclusive user. 
*/ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && - folio_ref_count(folio) == (1 + folio_nr_pages(folio)); + folio_ref_count(folio) == (extra_refs + folio_nr_pages(folio)); } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) @@ -4611,7 +4676,16 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq); +/* Sanity check that a folio is fully exclusive */ +static void check_swap_exclusive(struct folio *folio, swp_entry_t entry, + unsigned int nr_pages) +{ + /* Called under PT locked and folio locked, the swap count is stable */ + do { + VM_WARN_ON_ONCE_FOLIO(__swap_count(entry) != 1, folio); + entry.val++; + } while (--nr_pages); +} /* * We enter with non-exclusive mmap_lock (to exclude vma changes, @@ -4624,17 +4698,14 @@ static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq); vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct folio *swapcache, *folio = NULL; - DECLARE_WAITQUEUE(wait, current); + struct folio *swapcache = NULL, *folio; struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; - bool need_clear_cache = false; bool exclusive = false; softleaf_t entry; pte_t pte; vm_fault_t ret = 0; - void *shadow = NULL; int nr_pages; unsigned long page_idx; unsigned long address; @@ -4705,57 +4776,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio = swap_cache_get_folio(entry); if (folio) swap_update_readahead(folio, vma, vmf->address); - swapcache = folio; - if (!folio) { - if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && - __swap_count(entry) == 1) { - /* skip swapcache */ + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { folio = alloc_swap_folio(vmf); if (folio) { - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - - nr_pages = folio_nr_pages(folio); - if (folio_test_large(folio)) - entry.val = ALIGN_DOWN(entry.val, nr_pages); /* - * Prevent parallel swapin from 
proceeding with - * the cache flag. Otherwise, another thread - * may finish swapin first, free the entry, and - * swapout reusing the same entry. It's - * undetectable as pte_same() returns true due - * to entry reuse. + * folio is charged, so swapin can only fail due + * to raced swapin and return NULL. */ - if (swapcache_prepare(entry, nr_pages)) { - /* - * Relax a bit to prevent rapid - * repeated page faults. - */ - add_wait_queue(&swapcache_wq, &wait); - schedule_timeout_uninterruptible(1); - remove_wait_queue(&swapcache_wq, &wait); - goto out_page; - } - need_clear_cache = true; - - memcg1_swapin(entry, nr_pages); - - shadow = swap_cache_get_shadow(entry); - if (shadow) - workingset_refault(folio, shadow); - - folio_add_lru(folio); - - /* To provide entry to swap_read_folio() */ - folio->swap = entry; - swap_read_folio(folio, NULL); - folio->private = NULL; + swapcache = swapin_folio(entry, folio); + if (swapcache != folio) + folio_put(folio); + folio = swapcache; } } else { - folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, - vmf); - swapcache = folio; + folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); } if (!folio) { @@ -4777,60 +4812,58 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); } + swapcache = folio; ret |= folio_lock_or_retry(folio, vmf); if (ret & VM_FAULT_RETRY) goto out_release; page = folio_file_page(folio, swp_offset(entry)); - if (swapcache) { - /* - * Make sure folio_free_swap() or swapoff did not release the - * swapcache from under us. The page pin, and pte_same test - * below, are not enough to exclude that. Even if it is still - * swapcache, we need to check that the page's swap has not - * changed. 
- */ - if (unlikely(!folio_matches_swap_entry(folio, entry))) - goto out_page; - - if (unlikely(PageHWPoison(page))) { - /* - * hwpoisoned dirty swapcache pages are kept for killing - * owner processes (which may be unknown at hwpoison time) - */ - ret = VM_FAULT_HWPOISON; - goto out_page; - } - - /* - * KSM sometimes has to copy on read faults, for example, if - * folio->index of non-ksm folios would be nonlinear inside the - * anon VMA -- the ksm flag is lost on actual swapout. - */ - folio = ksm_might_need_to_copy(folio, vma, vmf->address); - if (unlikely(!folio)) { - ret = VM_FAULT_OOM; - folio = swapcache; - goto out_page; - } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { - ret = VM_FAULT_HWPOISON; - folio = swapcache; - goto out_page; - } - if (folio != swapcache) - page = folio_page(folio, 0); + /* + * Make sure folio_free_swap() or swapoff did not release the + * swapcache from under us. The page pin, and pte_same test + * below, are not enough to exclude that. Even if it is still + * swapcache, we need to check that the page's swap has not + * changed. + */ + if (unlikely(!folio_matches_swap_entry(folio, entry))) + goto out_page; + if (unlikely(PageHWPoison(page))) { /* - * If we want to map a page that's in the swapcache writable, we - * have to detect via the refcount if we're really the exclusive - * owner. Try removing the extra reference from the local LRU - * caches if required. + * hwpoisoned dirty swapcache pages are kept for killing + * owner processes (which may be unknown at hwpoison time) */ - if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && - !folio_test_ksm(folio) && !folio_test_lru(folio)) - lru_add_drain(); + ret = VM_FAULT_HWPOISON; + goto out_page; } + /* + * KSM sometimes has to copy on read faults, for example, if + * folio->index of non-ksm folios would be nonlinear inside the + * anon VMA -- the ksm flag is lost on actual swapout. 
+ */ + folio = ksm_might_need_to_copy(folio, vma, vmf->address); + if (unlikely(!folio)) { + ret = VM_FAULT_OOM; + folio = swapcache; + goto out_page; + } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { + ret = VM_FAULT_HWPOISON; + folio = swapcache; + goto out_page; + } else if (folio != swapcache) + page = folio_page(folio, 0); + + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * owner. Try removing the extra reference from the local LRU + * caches if required. + */ + if ((vmf->flags & FAULT_FLAG_WRITE) && + !folio_test_ksm(folio) && !folio_test_lru(folio)) + lru_add_drain(); + folio_throttle_swaprate(folio, GFP_KERNEL); /* @@ -4846,24 +4879,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_nomap; } - /* allocated large folios for SWP_SYNCHRONOUS_IO */ - if (folio_test_large(folio) && !folio_test_swapcache(folio)) { - unsigned long nr = folio_nr_pages(folio); - unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE); - unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE; - pte_t *folio_ptep = vmf->pte - idx; - pte_t folio_pte = ptep_get(folio_ptep); - - if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || - swap_pte_batch(folio_ptep, nr, folio_pte) != nr) - goto out_nomap; - - page_idx = idx; - address = folio_start; - ptep = folio_ptep; - goto check_folio; - } - nr_pages = 1; page_idx = 0; address = vmf->address; @@ -4908,11 +4923,36 @@ check_folio: BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); /* + * If a large folio already belongs to anon mapping, then we + * can just go on and map it partially. + * If not, with the large swapin check above failing, the page table + * have changed, so sub pages might got charged to the wrong cgroup, + * or even should be shmem. So we have to free it and fallback. 
+ * Nothing should have touched it, both anon and shmem checks if a + * large folio is fully appliable before use. + * + * This will be removed once we unify folio allocation in the swap cache + * layer, where allocation of a folio stabilizes the swap entries. + */ + if (!folio_test_anon(folio) && folio_test_large(folio) && + nr_pages != folio_nr_pages(folio)) { + if (!WARN_ON_ONCE(folio_test_dirty(folio))) + swap_cache_del_folio(folio); + goto out_nomap; + } + + /* * Check under PT lock (to protect against concurrent fork() sharing * the swap entry concurrently) for certainly exclusive pages. */ if (!folio_test_ksm(folio)) { + /* + * The can_swapin_thp check above ensures all PTE have + * same exclusiveness. Checking just one PTE is fine. + */ exclusive = pte_swp_exclusive(vmf->orig_pte); + if (exclusive) + check_swap_exclusive(folio, entry, nr_pages); if (folio != swapcache) { /* * We have a fresh page that is not exposed to the @@ -4946,19 +4986,10 @@ check_folio: /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry - * so this must be called before swap_free(). + * so this must be called before folio_put_swap(). */ arch_swap_restore(folio_swap(entry, folio), folio); - /* - * Remove the swap entry and conditionally try to free up the swapcache. - * We're already holding a reference on the page but haven't mapped it - * yet. 
- */ - swap_free_nr(entry, nr_pages); - if (should_try_to_free_swap(folio, vma, vmf->flags)) - folio_free_swap(folio); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); pte = mk_pte(page, vma->vm_page_prot); @@ -4990,22 +5021,24 @@ check_folio: vmf->orig_pte = pte_advance_pfn(pte, page_idx); /* ksm created a completely new copy */ - if (unlikely(folio != swapcache && swapcache)) { + if (unlikely(folio != swapcache)) { folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); + folio_put_swap(swapcache, NULL); } else if (!folio_test_anon(folio)) { /* - * We currently only expect small !anon folios which are either - * fully exclusive or fully shared, or new allocated large - * folios which are fully exclusive. If we ever get large - * folios within swapcache here, we have to be careful. + * We currently only expect !anon folios that are fully + * mappable. See the comment after can_swapin_thp above. */ - VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio)); - VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio); + VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); folio_add_new_anon_rmap(folio, vma, address, rmap_flags); + folio_put_swap(folio, NULL); } else { + VM_WARN_ON_ONCE(nr_pages != 1 && nr_pages != folio_nr_pages(folio)); folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, - rmap_flags); + rmap_flags); + folio_put_swap(folio, nr_pages == 1 ? page : NULL); } VM_BUG_ON(!folio_test_anon(folio) || @@ -5014,13 +5047,21 @@ check_folio: arch_do_swap_page_nr(vma->vm_mm, vma, address, pte, pte, nr_pages); + /* + * Remove the swap entry and conditionally try to free up the swapcache. + * Do it after mapping, so raced page faults will likely see the folio + * in swap cache and wait on the folio lock. 
+ */ + if (should_try_to_free_swap(si, folio, vma, nr_pages, vmf->flags)) + folio_free_swap(folio); + folio_unlock(folio); - if (folio != swapcache && swapcache) { + if (unlikely(folio != swapcache)) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check * (to avoid false positives from pte_same). For - * further safety release the lock after the swap_free + * further safety release the lock after the folio_put_swap * so that the swap count won't change under a * parallel locked swapcache. */ @@ -5041,12 +5082,6 @@ unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out: - /* Clear the swap cache pin for direct swapin after PTL unlock */ - if (need_clear_cache) { - swapcache_clear(si, entry, nr_pages); - if (waitqueue_active(&swapcache_wq)) - wake_up(&swapcache_wq); - } if (si) put_swap_device(si); return ret; @@ -5054,18 +5089,15 @@ out_nomap: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: + if (folio_test_swapcache(folio)) + folio_free_swap(folio); folio_unlock(folio); out_release: folio_put(folio); - if (folio != swapcache && swapcache) { + if (folio != swapcache) { folio_unlock(swapcache); folio_put(swapcache); } - if (need_clear_cache) { - swapcache_clear(si, entry, nr_pages); - if (waitqueue_active(&swapcache_wq)) - wake_up(&swapcache_wq); - } if (si) put_swap_device(si); return ret; @@ -5935,7 +5967,7 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, else *last_cpupid = folio_last_cpupid(folio); - /* Record the current PID acceesing VMA */ + /* Record the current PID accessing VMA */ vma_set_access_pid_bit(vma); count_vm_numa_event(NUMA_HINT_FAULTS); @@ -6254,7 +6286,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) * Use the maywrite version to indicate that vmf->pte may be * modified, but since we will use pte_same() to detect the * change of the !pte_none() entry, there is no need to recheck - * the pmdval. 
Here we chooes to pass a dummy variable instead + * the pmdval. Here we choose to pass a dummy variable instead * of NULL, which helps new user think about why this place is * special. */ @@ -7240,40 +7272,77 @@ static inline int process_huge_page( return 0; } -static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, - unsigned int nr_pages) +static void clear_contig_highpages(struct page *page, unsigned long addr, + unsigned int nr_pages) { - unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); - int i; + unsigned int i, count; + /* + * When clearing we want to operate on the largest extent possible to + * allow for architecture specific extent based optimizations. + * + * However, since clear_user_highpages() (and primitives clear_user_pages(), + * clear_pages()), do not call cond_resched(), limit the unit size when + * running under non-preemptible scheduling models. + */ + const unsigned int unit = preempt_model_preemptible() ? + nr_pages : PROCESS_PAGES_NON_PREEMPT_BATCH; might_sleep(); - for (i = 0; i < nr_pages; i++) { + + for (i = 0; i < nr_pages; i += count) { cond_resched(); - clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE); + + count = min(unit, nr_pages - i); + clear_user_highpages(page + i, addr + i * PAGE_SIZE, count); } } -static int clear_subpage(unsigned long addr, int idx, void *arg) -{ - struct folio *folio = arg; - - clear_user_highpage(folio_page(folio, idx), addr); - return 0; -} +/* + * When zeroing a folio, we want to differentiate between pages in the + * vicinity of the faulting address where we have spatial and temporal + * locality, and those far away where we don't. + * + * Use a radius of 2 for determining the local neighbourhood. + */ +#define FOLIO_ZERO_LOCALITY_RADIUS 2 /** * folio_zero_user - Zero a folio which will be mapped to userspace. * @folio: The folio to zero. - * @addr_hint: The address will be accessed or the base address if uncelar. 
+ * @addr_hint: The address accessed by the user or the base address. */ void folio_zero_user(struct folio *folio, unsigned long addr_hint) { - unsigned int nr_pages = folio_nr_pages(folio); + const unsigned long base_addr = ALIGN_DOWN(addr_hint, folio_size(folio)); + const long fault_idx = (addr_hint - base_addr) / PAGE_SIZE; + const struct range pg = DEFINE_RANGE(0, folio_nr_pages(folio) - 1); + const int radius = FOLIO_ZERO_LOCALITY_RADIUS; + struct range r[3]; + int i; - if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) - clear_gigantic_page(folio, addr_hint, nr_pages); - else - process_huge_page(addr_hint, nr_pages, clear_subpage, folio); + /* + * Faulting page and its immediate neighbourhood. Will be cleared at the + * end to keep its cachelines hot. + */ + r[2] = DEFINE_RANGE(clamp_t(s64, fault_idx - radius, pg.start, pg.end), + clamp_t(s64, fault_idx + radius, pg.start, pg.end)); + + /* Region to the left of the fault */ + r[1] = DEFINE_RANGE(pg.start, + clamp_t(s64, r[2].start - 1, pg.start - 1, r[2].start)); + + /* Region to the right of the fault: always valid for the common fault_idx=0 case. 
*/ + r[0] = DEFINE_RANGE(clamp_t(s64, r[2].end + 1, r[2].end, pg.end + 1), + pg.end); + + for (i = 0; i < ARRAY_SIZE(r); i++) { + const unsigned long addr = base_addr + r[i].start * PAGE_SIZE; + const unsigned int nr_pages = range_len(&r[i]); + struct page *page = folio_page(folio, r[i].start); + + if (nr_pages > 0) + clear_contig_highpages(page, addr, nr_pages); + } } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a63ec679d861..bc805029da51 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -926,7 +926,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * * MOVABLE : KERNEL_EARLY * - * Whereby KERNEL_EARLY is memory in one of the kernel zones, available sinze + * Whereby KERNEL_EARLY is memory in one of the kernel zones, available since * boot. We base our calculation on KERNEL_EARLY internally, because: * * a) Hotplugged memory in one of the kernel zones can sometimes still get @@ -946,8 +946,8 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * We rely on "present pages" instead of "managed pages", as the latter is * highly unreliable and dynamic in virtualized environments, and does not * consider boot time allocations. For example, memory ballooning adjusts the - * managed pages when inflating/deflating the balloon, and balloon compaction - * can even migrate inflated pages between zones. + * managed pages when inflating/deflating the balloon, and balloon page + * migration can even migrate inflated pages between zones. * * Using "present pages" is better but some things to keep in mind are: * @@ -1258,7 +1258,7 @@ static pg_data_t *hotadd_init_pgdat(int nid) * NODE_DATA is preallocated (free_area_init) but its internal * state is not allocated completely. Add missing pieces. * Completely offline nodes stay around and they just need - * reintialization. + * reinitialization. 
*/ pgdat = NODE_DATA(nid); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 68a98ba57882..dbd48502ac24 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -365,7 +365,7 @@ static const struct mempolicy_operations { static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { - return pol->flags & MPOL_MODE_FLAGS; + return pol->flags & MPOL_USER_NODEMASK_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, @@ -1909,8 +1909,7 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, } task_nodes = cpuset_mems_allowed(current); - nodes_and(*new, *new, task_nodes); - if (nodes_empty(*new)) + if (!nodes_and(*new, *new, task_nodes)) goto out_put; err = security_task_movememory(task); diff --git a/mm/migrate.c b/mm/migrate.c index 4688b9e38cd2..1bf2cf8c44dd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -88,7 +88,7 @@ static const struct movable_operations *page_movable_ops(struct page *page) * back to the buddy. */ if (PageOffline(page)) - /* Only balloon compaction sets PageOffline pages movable. */ + /* Only balloon page migration sets PageOffline pages movable. */ return offline_movable_ops; if (PageZsmalloc(page)) return zsmalloc_movable_ops; @@ -452,11 +452,12 @@ static bool remove_migration_pte(struct folio *folio, * Get rid of all migration entries and replace them by * references to the indicated page. 
*/ -void remove_migration_ptes(struct folio *src, struct folio *dst, int flags) +void remove_migration_ptes(struct folio *src, struct folio *dst, + enum ttu_flags flags) { struct rmap_walk_arg rmap_walk_arg = { .folio = src, - .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE, + .map_unused_to_zeropage = flags & TTU_USE_SHARED_ZEROPAGE, }; struct rmap_walk_control rwc = { @@ -464,9 +465,9 @@ void remove_migration_ptes(struct folio *src, struct folio *dst, int flags) .arg = &rmap_walk_arg, }; - VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src); + VM_BUG_ON_FOLIO((flags & TTU_USE_SHARED_ZEROPAGE) && (src != dst), src); - if (flags & RMP_LOCKED) + if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(dst, &rwc); else rmap_walk(dst, &rwc); @@ -1521,8 +1522,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) - remove_migration_ptes(src, !rc ? dst : src, - ttu ? RMP_LOCKED : 0); + remove_migration_ptes(src, !rc ? 
dst : src, ttu); if (ttu & TTU_RMAP_LOCKED) i_mmap_unlock_write(mapping); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 23379663b1e1..0a8b31939640 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -271,7 +271,7 @@ again: ptep = pte_offset_map_lock(mm, pmdp, start, &ptl); if (!ptep) goto again; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); ptep += (addr - start) / PAGE_SIZE; for (; addr < end; addr += PAGE_SIZE, ptep++) { @@ -313,7 +313,7 @@ again: if (folio_test_large(folio)) { int ret; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep, ptl); ret = migrate_vma_split_folio(folio, migrate->fault_page); @@ -356,7 +356,7 @@ again: if (folio && folio_test_large(folio)) { int ret; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep, ptl); ret = migrate_vma_split_folio(folio, migrate->fault_page); @@ -485,7 +485,7 @@ next: if (unmapped) flush_tlb_range(walk->vma, start, end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(ptep - 1, ptl); return 0; @@ -1419,10 +1419,10 @@ EXPORT_SYMBOL(migrate_device_range); /** * migrate_device_pfns() - migrate device private pfns to normal memory. - * @src_pfns: pre-popluated array of source device private pfns to migrate. + * @src_pfns: pre-populated array of source device private pfns to migrate. * @npages: number of pages to migrate. * - * Similar to migrate_device_range() but supports non-contiguous pre-popluated + * Similar to migrate_device_range() but supports non-contiguous pre-populated * array of device pages to migrate. 
*/ int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages) diff --git a/mm/mm_init.c b/mm/mm_init.c index 2a809cd8e7fa..1a29a719af58 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -187,7 +187,7 @@ void mm_compute_batch(int overcommit_policy) /* * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of * (total memory/#cpus), and lift it to 25% for other policies - * to easy the possible lock contention for percpu_counter + * to ease the possible lock contention for percpu_counter * vm_committed_as, while the max limit is INT_MAX */ if (overcommit_policy == OVERCOMMIT_NEVER) @@ -646,21 +646,18 @@ int __meminit early_pfn_to_nid(unsigned long pfn) return nid; } -int hashdist = HASHDIST_DEFAULT; +bool hashdist = HASHDIST_DEFAULT; static int __init set_hashdist(char *str) { - if (!str) - return 0; - hashdist = simple_strtoul(str, &str, 0); - return 1; + return kstrtobool(str, &hashdist) == 0; } __setup("hashdist=", set_hashdist); static inline void fixup_hashdist(void) { if (num_node_state(N_MEMORY) == 1) - hashdist = 0; + hashdist = false; } #else static inline void fixup_hashdist(void) {} @@ -1748,7 +1745,7 @@ static void __init free_area_init_node(int nid) lru_gen_init_pgdat(pgdat); } -/* Any regular or high memory on that node ? */ +/* Any regular or high memory on that node? */ static void __init check_for_memory(pg_data_t *pgdat) { enum zone_type zone_type; @@ -1810,7 +1807,6 @@ static void __init set_high_memory(void) /** * free_area_init - Initialise all pg_data_t and zone data - * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. * Using the page ranges provided by memblock_set_node(), the size of each @@ -1821,17 +1817,15 @@ static void __init set_high_memory(void) * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn. 
*/ -void __init free_area_init(unsigned long *max_zone_pfn) +static void __init free_area_init(void) { + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; unsigned long start_pfn, end_pfn; int i, nid, zone; bool descending; - /* Record where the zone boundaries are */ - memset(arch_zone_lowest_possible_pfn, 0, - sizeof(arch_zone_lowest_possible_pfn)); - memset(arch_zone_highest_possible_pfn, 0, - sizeof(arch_zone_highest_possible_pfn)); + arch_zone_limits_init(max_zone_pfn); + sparse_init(); start_pfn = PHYS_PFN(memblock_start_of_DRAM()); descending = arch_has_descending_max_zone_pfns(); @@ -2048,7 +2042,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone, * Initialize and free pages. * * At this point reserved pages and struct pages that correspond to holes in - * memblock.memory are already intialized so every free range has a valid + * memblock.memory are already initialized so every free range has a valid * memory map around it. * This ensures that access of pages that are ahead of the range being * initialized (computing buddy page in __free_one_page()) always reads a valid @@ -2681,13 +2675,20 @@ void __init __weak mem_init(void) { } +void __init mm_core_init_early(void) +{ + hugetlb_cma_reserve(); + hugetlb_bootmem_alloc(); + + free_area_init(); +} + /* * Set up kernel memory allocators */ void __init mm_core_init(void) { arch_mm_preinit(); - hugetlb_bootmem_alloc(); /* Initializations relying on SMP setup */ BUILD_BUG_ON(MAX_ZONELISTS > 2); diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 7421b7ea8001..898c2ef1e958 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -45,65 +45,111 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released); #ifdef CONFIG_MMU #ifdef CONFIG_PER_VMA_LOCK + +/* State shared across __vma_[start, end]_exclude_readers. */ +struct vma_exclude_readers_state { + /* Input parameters. */ + struct vm_area_struct *vma; + int state; /* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */ + bool detaching; + + /* Output parameters. 
*/ + bool detached; + bool exclusive; /* Are we exclusively locked? */ +}; + /* - * __vma_enter_locked() returns 0 immediately if the vma is not - * attached, otherwise it waits for any current readers to finish and - * returns 1. Returns -EINTR if a signal is received while waiting. + * Now that all readers have been evicted, mark the VMA as being out of the + * 'exclude readers' state. */ -static inline int __vma_enter_locked(struct vm_area_struct *vma, - bool detaching, int state) +static void __vma_end_exclude_readers(struct vma_exclude_readers_state *ves) { - int err; - unsigned int tgt_refcnt = VMA_LOCK_OFFSET; + struct vm_area_struct *vma = ves->vma; - mmap_assert_write_locked(vma->vm_mm); + VM_WARN_ON_ONCE(ves->detached); + + ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG, + &vma->vm_refcnt); + __vma_lockdep_release_exclusive(vma); +} + +static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves) +{ + const unsigned int tgt = ves->detaching ? 0 : 1; + + return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG; +} + +/* + * Mark the VMA as being in a state of excluding readers, check to see if any + * VMA read locks are indeed held, and if so wait for them to be released. + * + * Note that this function pairs with vma_refcount_put() which will wake up this + * thread when it detects that the last reader has released its lock. + * + * The ves->state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases + * where we wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal + * signal is permitted to kill it. + * + * The function sets the ves->exclusive parameter to true if readers were + * excluded, or false if the VMA was detached or an error arose on wait. + * + * If the function indicates an exclusive lock was acquired via ves->exclusive + * the caller is required to invoke __vma_end_exclude_readers() once the + * exclusive state is no longer required. 
+ * + * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the + * function may also return -EINTR to indicate a fatal signal was received while + * waiting. Otherwise, the function returns 0. + */ +static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves) +{ + struct vm_area_struct *vma = ves->vma; + unsigned int tgt_refcnt = get_target_refcnt(ves); + int err = 0; - /* Additional refcnt if the vma is attached. */ - if (!detaching) - tgt_refcnt++; + mmap_assert_write_locked(vma->vm_mm); /* * If vma is detached then only vma_mark_attached() can raise the * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). + * + * See the comment describing the vm_area_struct->vm_refcnt field for + * details of possible refcnt values. */ - if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) + if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) { + ves->detached = true; return 0; + } - rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); + __vma_lockdep_acquire_exclusive(vma); err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, - state); + ves->state); if (err) { - if (refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt)) { - /* - * The wait failed, but the last reader went away - * as well. Tell the caller the VMA is detached. 
- */ - WARN_ON_ONCE(!detaching); - err = 0; - } - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + __vma_end_exclude_readers(ves); return err; } - lock_acquired(&vma->vmlock_dep_map, _RET_IP_); - return 1; -} - -static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) -{ - *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + __vma_lockdep_stat_mark_acquired(vma); + ves->exclusive = true; + return 0; } -int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, - int state) +int __vma_start_write(struct vm_area_struct *vma, int state) { - int locked; + const unsigned int mm_lock_seq = __vma_raw_mm_seqnum(vma); + struct vma_exclude_readers_state ves = { + .vma = vma, + .state = state, + }; + int err; - locked = __vma_enter_locked(vma, false, state); - if (locked < 0) - return locked; + err = __vma_start_exclude_readers(&ves); + if (err) { + WARN_ON_ONCE(ves.detached); + return err; + } /* * We should use WRITE_ONCE() here because we can have concurrent reads @@ -113,39 +159,42 @@ int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, */ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - if (locked) { - bool detached; - - __vma_exit_locked(vma, &detached); - WARN_ON_ONCE(detached); /* vma should remain attached */ + if (ves.exclusive) { + __vma_end_exclude_readers(&ves); + /* VMA should remain attached. */ + WARN_ON_ONCE(ves.detached); } return 0; } EXPORT_SYMBOL_GPL(__vma_start_write); -void vma_mark_detached(struct vm_area_struct *vma) +void __vma_exclude_readers_for_detach(struct vm_area_struct *vma) { - vma_assert_write_locked(vma); - vma_assert_attached(vma); + struct vma_exclude_readers_state ves = { + .vma = vma, + .state = TASK_UNINTERRUPTIBLE, + .detaching = true, + }; + int err; /* - * We are the only writer, so no need to use vma_refcount_put(). 
- * The condition below is unlikely because the vma has been already - * write-locked and readers can increment vm_refcnt only temporarily - * before they check vm_lock_seq, realize the vma is locked and drop - * back the vm_refcnt. That is a narrow window for observing a raised - * vm_refcnt. + * Wait until the VMA is detached with no readers. Since we hold the VMA + * write lock, the only read locks that might be present are those from + * threads trying to acquire the read lock and incrementing the + * reference count before realising the write lock is held and + * decrementing it. */ - if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { - /* Wait until vma is detached with no readers. */ - if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) { - bool detached; - - __vma_exit_locked(vma, &detached); - WARN_ON_ONCE(!detached); - } + err = __vma_start_exclude_readers(&ves); + if (!err && ves.exclusive) { + /* + * Once this is complete, no readers can increment the + * reference count, and the VMA is marked detached. + */ + __vma_end_exclude_readers(&ves); } + /* If an error arose but we were detached anyway, we don't care. */ + WARN_ON_ONCE(!ves.detached); } /* @@ -180,19 +229,21 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, } /* - * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() - * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * If VM_REFCNT_EXCLUDE_READERS_FLAG is set, + * __refcount_inc_not_zero_limited_acquire() will fail because + * VM_REFCNT_LIMIT is less than VM_REFCNT_EXCLUDE_READERS_FLAG. + * * Acquire fence is required here to avoid reordering against later * vm_lock_seq check and checks inside lock_vma_under_rcu(). */ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) { + VM_REFCNT_LIMIT))) { /* return EAGAIN if vma got detached from under us */ vma = oldcnt ? 
NULL : ERR_PTR(-EAGAIN); goto err; } - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + __vma_lockdep_acquire_read(vma); if (unlikely(vma->vm_mm != mm)) goto err_unstable; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 7468ec388455..fe5b6a031717 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/kernel.h> @@ -210,10 +211,9 @@ bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, PAGE_SIZE); } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, - bool delay_rmap, int page_size) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size); + return __tlb_remove_folio_pages_size(tlb, page, 1, false, page_size); } #endif /* MMU_GATHER_NO_GATHER */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 283889e4f1ce..c0571445bef7 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -233,7 +233,7 @@ static long change_pte_range(struct mmu_gather *tlb, is_private_single_threaded = vma_is_single_threaded_private(vma); flush_tlb_batched_pending(vma->vm_mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { nr_ptes = 1; oldpte = ptep_get(pte); @@ -379,7 +379,7 @@ static long change_pte_range(struct mmu_gather *tlb, } } } while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte - 1, ptl); return pages; diff --git a/mm/mremap.c b/mm/mremap.c index 672264807db6..8391ae17de64 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -260,7 +260,7 @@ static int move_ptes(struct pagetable_move_control *pmc, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); flush_tlb_batched_pending(vma->vm_mm); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += 
nr_ptes * PAGE_SIZE, new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) { @@ -305,7 +305,7 @@ static int move_ptes(struct pagetable_move_control *pmc, } } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (force_flush) flush_tlb_range(vma, old_end - len, old_end); if (new_ptl != old_ptl) @@ -678,7 +678,7 @@ static bool can_realign_addr(struct pagetable_move_control *pmc, /* * We don't want to have to go hunting for VMAs from the end of the old * VMA to the next page table boundary, also we want to make sure the - * operation is wortwhile. + * operation is worthwhile. * * So ensure that we only perform this realignment if the end of the * range being copied reaches or crosses the page table boundary. @@ -926,7 +926,7 @@ static bool vrm_overlaps(struct vma_remap_struct *vrm) /* * Will a new address definitely be assigned? This either if the user specifies * it via MREMAP_FIXED, or if MREMAP_DONTUNMAP is used, indicating we will - * always detemrine a target address. + * always determine a target address. */ static bool vrm_implies_new_addr(struct vma_remap_struct *vrm) { @@ -1806,7 +1806,7 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) /* * move_vma() need us to stay 4 maps below the threshold, otherwise * it will bail out at the very beginning. - * That is a problem if we have already unmaped the regions here + * That is a problem if we have already unmapped the regions here * (new_addr, and old_addr), because userspace will not know the * state of the vma's after it gets -ENOMEM. * So, to avoid such scenario we can pre-compute if the whole diff --git a/mm/mseal.c b/mm/mseal.c index ae442683c5c0..316b5e1dec78 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -21,7 +21,7 @@ * It disallows unmapped regions from start to end whether they exist at the * start, in the middle, or at the end of the range, or any combination thereof. 
* - * This is because after sealng a range, there's nothing to stop memory mapping + * This is because after sealing a range, there's nothing to stop memory mapping * of ranges in the remaining gaps later, meaning that the user might then * wrongly consider the entirety of the mseal()'d range to be sealed when it * in fact isn't. @@ -124,7 +124,7 @@ static int mseal_apply(struct mm_struct *mm, * -EINVAL: * invalid input flags. * start address is not page aligned. - * Address arange (start + len) overflow. + * Address range (start + len) overflow. * -ENOMEM: * addr is not a valid address (not allocated). * end (start + len) is not a valid address. diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index 8f5735fda0a2..391f53e63ea3 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -467,7 +467,7 @@ int __init numa_memblks_init(int (*init_func)(void), * We reset memblock back to the top-down direction * here because if we configured ACPI_NUMA, we have * parsed SRAT in init_func(). It is ok to have the - * reset here even if we did't configure ACPI_NUMA + * reset here even if we didn't configure ACPI_NUMA * or acpi numa init fails and fallbacks to dummy * numa init. */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5eb11fbba704..5c6c95c169ee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -228,7 +228,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. 
*/ - points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + + points = get_mm_rss_sum(p->mm) + get_mm_counter_sum(p->mm, MM_SWAPENTS) + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); @@ -402,10 +402,10 @@ static int dump_task(struct task_struct *p, void *arg) pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8lu %9lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), - task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - get_mm_counter(task->mm, MM_ANONPAGES), get_mm_counter(task->mm, MM_FILEPAGES), - get_mm_counter(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm), - get_mm_counter(task->mm, MM_SWAPENTS), + task->tgid, task->mm->total_vm, get_mm_rss_sum(task->mm), + get_mm_counter_sum(task->mm, MM_ANONPAGES), get_mm_counter_sum(task->mm, MM_FILEPAGES), + get_mm_counter_sum(task->mm, MM_SHMEMPAGES), mm_pgtables_bytes(task->mm), + get_mm_counter_sum(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); @@ -458,7 +458,7 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim) static void dump_header(struct oom_control *oc) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%d\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) @@ -604,9 +604,9 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, - K(get_mm_counter(mm, MM_ANONPAGES)), - K(get_mm_counter(mm, MM_FILEPAGES)), - K(get_mm_counter(mm, MM_SHMEMPAGES))); + K(get_mm_counter_sum(mm, MM_ANONPAGES)), + K(get_mm_counter_sum(mm, MM_FILEPAGES)), + K(get_mm_counter_sum(mm, MM_SHMEMPAGES))); out_finish: trace_finish_task_reaping(tsk->pid); out_unlock: @@ -958,11 +958,11 
@@ static void __oom_kill_process(struct task_struct *victim, const char *message) */ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); - pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n", + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%d\n", message, task_pid_nr(victim), victim->comm, K(mm->total_vm), - K(get_mm_counter(mm, MM_ANONPAGES)), - K(get_mm_counter(mm, MM_FILEPAGES)), - K(get_mm_counter(mm, MM_SHMEMPAGES)), + K(get_mm_counter_sum(mm, MM_ANONPAGES)), + K(get_mm_counter_sum(mm, MM_FILEPAGES)), + K(get_mm_counter_sum(mm, MM_SHMEMPAGES)), from_kuid(&init_user_ns, task_uid(victim)), mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj); task_unlock(victim); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ccdeb0e84d39..601a5e048d12 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -109,14 +109,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval); */ unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ -/* - * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: - * a full sync is triggered after this time elapses without any disk activity. - */ -int laptop_mode; - -EXPORT_SYMBOL(laptop_mode); - /* End of sysctl-exported parameters */ struct wb_domain global_wb_domain; @@ -1843,17 +1835,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb, balance_domain_limits(mdtc, strictlimit); } - /* - * In laptop mode, we wait until hitting the higher threshold - * before starting background writeout, and then write out all - * the way down to the lower threshold. So slow writers cause - * minimal disk activity. - * - * In normal mode, we start background writeout at the lower - * background_thresh, to keep the amount of dirty memory low. 
- */ - if (!laptop_mode && nr_dirty > gdtc->bg_thresh && - !writeback_in_progress(wb)) + if (nr_dirty > gdtc->bg_thresh && !writeback_in_progress(wb)) wb_start_background_writeback(wb); /* @@ -1876,10 +1858,6 @@ free_running: break; } - /* Start writeback even when in laptop mode */ - if (unlikely(!writeback_in_progress(wb))) - wb_start_background_writeback(wb); - mem_cgroup_flush_foreign(wb); /* @@ -2198,41 +2176,6 @@ static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int } #endif -void laptop_mode_timer_fn(struct timer_list *t) -{ - struct backing_dev_info *backing_dev_info = - timer_container_of(backing_dev_info, t, laptop_mode_wb_timer); - - wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER); -} - -/* - * We've spun up the disk and we're in laptop mode: schedule writeback - * of all dirty data a few seconds from now. If the flush is already scheduled - * then push it back - the user is still using the disk. - */ -void laptop_io_completion(struct backing_dev_info *info) -{ - mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); -} - -/* - * We're in laptop mode and we've just synced. The sync's writes will have - * caused another writeback to be scheduled by laptop_io_completion. - * Nothing needs to be written back anymore, so we unschedule the writeback. - */ -void laptop_sync_completion(void) -{ - struct backing_dev_info *bdi; - - rcu_read_lock(); - - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) - timer_delete(&bdi->laptop_mode_wb_timer); - - rcu_read_unlock(); -} - /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. 
@@ -2263,6 +2206,19 @@ static int page_writeback_cpu_online(unsigned int cpu) #ifdef CONFIG_SYSCTL +static int laptop_mode; +static int laptop_mode_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_jiffies(table, write, buffer, lenp, ppos); + + if (!ret && write) + pr_warn("%s: vm.laptop_mode is deprecated. Ignoring setting.\n", + current->comm); + + return ret; +} + /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -2332,7 +2288,7 @@ static const struct ctl_table vm_page_writeback_sysctls[] = { .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = laptop_mode_handler, }, }; #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d312ebaa1e77..5fd9e4a03a4d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/mm/page_alloc.c * * Manages the free list, the system allocates free pages here. * Note that kmalloc() lives in slab.c @@ -1853,7 +1852,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* * As memory initialization might be integrated into KASAN, - * KASAN unpoisoning and memory initializion code must be + * KASAN unpoisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. */ @@ -2946,9 +2945,9 @@ static bool free_frozen_page_commit(struct zone *zone, * 'hopeless node' to stay in that state for a while. Let * kswapd work again by resetting kswapd_failures. 
*/ - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES && + if (kswapd_test_hopeless(pgdat) && next_memory_node(pgdat->node_id) < MAX_NUMNODES) - atomic_set(&pgdat->kswapd_failures, 0); + kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_PCP); } return ret; } @@ -3112,6 +3111,15 @@ void free_unref_folios(struct folio_batch *folios) folio_batch_reinit(folios); } +static void __split_page(struct page *page, unsigned int order) +{ + VM_WARN_ON_PAGE(PageCompound(page), page); + + split_page_owner(page, order, 0); + pgalloc_tag_split(page_folio(page), order, 0); + split_page_memcg(page, order); +} + /* * split_page takes a non-compound higher-order page, and splits it into * n (1<<order) sub-pages: page[0..n] @@ -3124,14 +3132,12 @@ void split_page(struct page *page, unsigned int order) { int i; - VM_BUG_ON_PAGE(PageCompound(page), page); - VM_BUG_ON_PAGE(!page_count(page), page); + VM_WARN_ON_PAGE(!page_count(page), page); for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, order, 0); - pgalloc_tag_split(page_folio(page), order, 0); - split_page_memcg(page, order); + + __split_page(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -4699,7 +4705,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; - bool can_compact = gfp_compaction_allowed(gfp_mask); + bool can_compact = can_direct_reclaim && gfp_compaction_allowed(gfp_mask); bool nofail = gfp_mask & __GFP_NOFAIL; const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; @@ -4712,6 +4718,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; unsigned int zonelist_iter_cookie; int reserve_flags; + bool compact_first = false; + bool can_retry_reserves = true; if (unlikely(nofail)) { /* @@ -4736,6 +4744,19 @@ restart: zonelist_iter_cookie = zonelist_iter_begin(); /* + * For costly allocations, try direct 
compaction first, as it's likely + * that we have enough base pages and don't need to reclaim. For non- + * movable high-order allocations, do that as well, as compaction will + * try prevent permanent fragmentation by migrating from blocks of the + * same migratetype. + */ + if (can_compact && (costly_order || (order > 0 && + ac->migratetype != MIGRATE_MOVABLE))) { + compact_first = true; + compact_priority = INIT_COMPACT_PRIORITY; + } + + /* * The fast path uses conservative alloc_flags to succeed only until * kswapd needs to be woken up, and to avoid the cost of setting up * alloc_flags precisely. So we do that now. @@ -4766,6 +4787,8 @@ restart: goto nopage; } +retry: + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); @@ -4777,74 +4800,6 @@ restart: if (page) goto got_pg; - /* - * For costly allocations, try direct compaction first, as it's likely - * that we have enough base pages and don't need to reclaim. For non- - * movable high-order allocations, do that as well, as compaction will - * try prevent permanent fragmentation by migrating from blocks of the - * same migratetype. - * Don't try this for allocations that are allowed to ignore - * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 
- */ - if (can_direct_reclaim && can_compact && - (costly_order || - (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) - && !gfp_pfmemalloc_allowed(gfp_mask)) { - page = __alloc_pages_direct_compact(gfp_mask, order, - alloc_flags, ac, - INIT_COMPACT_PRIORITY, - &compact_result); - if (page) - goto got_pg; - - /* - * Checks for costly allocations with __GFP_NORETRY, which - * includes some THP page fault allocations - */ - if (costly_order && (gfp_mask & __GFP_NORETRY)) { - /* - * If allocating entire pageblock(s) and compaction - * failed because all zones are below low watermarks - * or is prohibited because it recently failed at this - * order, fail immediately unless the allocator has - * requested compaction and reclaim retry. - * - * Reclaim is - * - potentially very expensive because zones are far - * below their low watermarks or this is part of very - * bursty high order allocations, - * - not guaranteed to help because isolate_freepages() - * may not iterate over freed pages as part of its - * linear scan, and - * - unlikely to make entire pageblocks free on its - * own. - */ - if (compact_result == COMPACT_SKIPPED || - compact_result == COMPACT_DEFERRED) - goto nopage; - - /* - * Looks like reclaim/compaction is worth trying, but - * sync compaction could be very expensive, so keep - * using async compaction. - */ - compact_priority = INIT_COMPACT_PRIORITY; - } - } - -retry: - /* - * Deal with possible cpuset update races or zonelist updates to avoid - * infinite retries. 
- */ - if (check_retry_cpuset(cpuset_mems_cookie, ac) || - check_retry_zonelist(zonelist_iter_cookie)) - goto restart; - - /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ - if (alloc_flags & ALLOC_KSWAPD) - wake_all_kswapds(order, gfp_mask, ac); - reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) | @@ -4859,12 +4814,18 @@ retry: ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->highest_zoneidx, ac->nodemask); - } - /* Attempt with potentially adjusted zonelist and alloc_flags */ - page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - if (page) - goto got_pg; + /* + * The first time we adjust anything due to being allowed to + * ignore memory policies or watermarks, retry immediately. This + * allows us to keep the first allocation attempt optimistic so + * it can succeed in a zone that is still above watermarks. + */ + if (can_retry_reserves) { + can_retry_reserves = false; + goto retry; + } + } /* Caller is not willing to reclaim, we can't balance anything */ if (!can_direct_reclaim) @@ -4875,10 +4836,12 @@ retry: goto nopage; /* Try direct reclaim and then allocating */ - page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, - &did_some_progress); - if (page) - goto got_pg; + if (!compact_first) { + page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, + ac, &did_some_progress); + if (page) + goto got_pg; + } /* Try direct compaction and then allocating */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, @@ -4886,6 +4849,33 @@ retry: if (page) goto got_pg; + if (compact_first) { + /* + * THP page faults may attempt local node only first, but are + * then allowed to only compact, not reclaim, see + * alloc_pages_mpol(). 
+ * + * Compaction has failed above and we don't want such THP + * allocations to put reclaim pressure on a single node in a + * situation where other nodes might have plenty of available + * memory. + */ + if (gfp_has_flags(gfp_mask, __GFP_NORETRY | __GFP_THISNODE)) + goto nopage; + + /* + * For the initial compaction attempt we have lowered its + * priority. Restore it for further retries, if those are + * allowed. With __GFP_NORETRY there will be a single round of + * reclaim and compaction with the lowered priority. + */ + if (!(gfp_mask & __GFP_NORETRY)) + compact_priority = DEF_COMPACT_PRIORITY; + + compact_first = false; + goto retry; + } + /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) goto nopage; @@ -4898,6 +4888,15 @@ retry: !(gfp_mask & __GFP_RETRY_MAYFAIL))) goto nopage; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * infinite retries. No "goto retry;" can be placed above this check + * unless it can execute just once. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; + if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; @@ -5401,9 +5400,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *page = virt_to_page((void *)addr); struct page *last = page + nr; - split_page_owner(page, order, 0); - pgalloc_tag_split(page_folio(page), order, 0); - split_page_memcg(page, order); + __split_page(page, order); while (page < --last) set_page_refcounted(last); @@ -6896,7 +6893,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, return (ret < 0) ? 
ret : 0; } -static void split_free_pages(struct list_head *list, gfp_t gfp_mask) +static void split_free_frozen_pages(struct list_head *list, gfp_t gfp_mask) { int order; @@ -6908,11 +6905,10 @@ static void split_free_pages(struct list_head *list, gfp_t gfp_mask) int i; post_alloc_hook(page, order, gfp_mask); - set_page_refcounted(page); if (!order) continue; - split_page(page, order); + __split_page(page, order); /* Add all subpages to the order-0 head, in sequence. */ list_del(&page->lru); @@ -6956,8 +6952,14 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) return 0; } +static void __free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages) +{ + for (; nr_pages--; pfn++) + free_frozen_pages(pfn_to_page(pfn), 0); +} + /** - * alloc_contig_range() -- tries to allocate given range of pages + * alloc_contig_frozen_range() -- tries to allocate given range of frozen pages * @start: start PFN to allocate * @end: one-past-the-last PFN to allocate * @alloc_flags: allocation information @@ -6972,12 +6974,15 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) * pageblocks in the range. Once isolated, the pageblocks should not * be modified by others. * - * Return: zero on success or negative error code. On success all - * pages which PFN is in [start, end) are allocated for the caller and - * need to be freed with free_contig_range(). + * All frozen pages which PFN is in [start, end) are allocated for the + * caller, and they could be freed with free_contig_frozen_range(), + * free_frozen_pages() also could be used to free compound frozen pages + * directly. + * + * Return: zero on success or negative error code. 
*/ -int alloc_contig_range_noprof(unsigned long start, unsigned long end, - acr_flags_t alloc_flags, gfp_t gfp_mask) +int alloc_contig_frozen_range_noprof(unsigned long start, unsigned long end, + acr_flags_t alloc_flags, gfp_t gfp_mask) { const unsigned int order = ilog2(end - start); unsigned long outer_start, outer_end; @@ -7093,19 +7098,18 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, } if (!(gfp_mask & __GFP_COMP)) { - split_free_pages(cc.freepages, gfp_mask); + split_free_frozen_pages(cc.freepages, gfp_mask); /* Free head and tail (if any) */ if (start != outer_start) - free_contig_range(outer_start, start - outer_start); + __free_contig_frozen_range(outer_start, start - outer_start); if (end != outer_end) - free_contig_range(end, outer_end - end); + __free_contig_frozen_range(end, outer_end - end); } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { struct page *head = pfn_to_page(start); check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); - set_page_refcounted(head); } else { ret = -EINVAL; WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", @@ -7115,36 +7119,86 @@ done: undo_isolate_page_range(start, end); return ret; } -EXPORT_SYMBOL(alloc_contig_range_noprof); +EXPORT_SYMBOL(alloc_contig_frozen_range_noprof); -static int __alloc_contig_pages(unsigned long start_pfn, - unsigned long nr_pages, gfp_t gfp_mask) +/** + * alloc_contig_range() -- tries to allocate given range of pages + * @start: start PFN to allocate + * @end: one-past-the-last PFN to allocate + * @alloc_flags: allocation information + * @gfp_mask: GFP mask. + * + * This routine is a wrapper around alloc_contig_frozen_range(), it can't + * be used to allocate compound pages, the refcount of each allocated page + * will be set to one. 
+ * + * All pages which PFN is in [start, end) are allocated for the caller, + * and should be freed with free_contig_range() or by manually calling + * __free_page() on each allocated page. + * + * Return: zero on success or negative error code. + */ +int alloc_contig_range_noprof(unsigned long start, unsigned long end, + acr_flags_t alloc_flags, gfp_t gfp_mask) { - unsigned long end_pfn = start_pfn + nr_pages; + int ret; - return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE, - gfp_mask); + if (WARN_ON(gfp_mask & __GFP_COMP)) + return -EINVAL; + + ret = alloc_contig_frozen_range_noprof(start, end, alloc_flags, gfp_mask); + if (!ret) + set_pages_refcounted(pfn_to_page(start), end - start); + + return ret; } +EXPORT_SYMBOL(alloc_contig_range_noprof); static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, - unsigned long nr_pages) + unsigned long nr_pages, bool skip_hugetlb, + bool *skipped_hugetlb) { - unsigned long i, end_pfn = start_pfn + nr_pages; + unsigned long end_pfn = start_pfn + nr_pages; struct page *page; - for (i = start_pfn; i < end_pfn; i++) { - page = pfn_to_online_page(i); + while (start_pfn < end_pfn) { + unsigned long step = 1; + + page = pfn_to_online_page(start_pfn); if (!page) return false; if (page_zone(page) != z) return false; - if (PageReserved(page)) + if (page_is_unmovable(z, page, PB_ISOLATE_MODE_OTHER, &step)) return false; - if (PageHuge(page)) - return false; + /* + * Only consider ranges containing hugepages if those pages are + * smaller than the requested contiguous region. e.g.: + * Move 2MB pages to free up a 1GB range. + * Don't move 1GB pages to free up a 2MB range. + * + * This makes contiguous allocation more reliable if multiple + * hugepage sizes are used without causing needless movement. 
+ */ + if (PageHuge(page)) { + unsigned int order; + + if (skip_hugetlb) { + *skipped_hugetlb = true; + return false; + } + + page = compound_head(page); + order = compound_order(page); + if ((order >= MAX_FOLIO_ORDER) || + (nr_pages <= (1 << order))) + return false; + } + + start_pfn += step; } return true; } @@ -7158,7 +7212,7 @@ static bool zone_spans_last_pfn(const struct zone *zone, } /** - * alloc_contig_pages() -- tries to find and allocate contiguous range of pages + * alloc_contig_frozen_pages() -- tries to find and allocate contiguous range of frozen pages * @nr_pages: Number of contiguous pages to allocate * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some * action and reclaim modifiers are supported. Reclaim modifiers @@ -7166,28 +7220,34 @@ static bool zone_spans_last_pfn(const struct zone *zone, * @nid: Target node * @nodemask: Mask for other possible nodes * - * This routine is a wrapper around alloc_contig_range(). It scans over zones - * on an applicable zonelist to find a contiguous pfn range which can then be - * tried for allocation with alloc_contig_range(). This routine is intended - * for allocation requests which can not be fulfilled with the buddy allocator. + * This routine is a wrapper around alloc_contig_frozen_range(). It scans over + * zones on an applicable zonelist to find a contiguous pfn range which can then + * be tried for allocation with alloc_contig_frozen_range(). This routine is + * intended for allocation requests which can not be fulfilled with the buddy + * allocator. * * The allocated memory is always aligned to a page boundary. If nr_pages is a * power of two, then allocated range is also guaranteed to be aligned to same * nr_pages (e.g. 1GB request would be aligned to 1GB). * - * Allocated pages can be freed with free_contig_range() or by manually calling - * __free_page() on each allocated page. 
+ * Allocated frozen pages need be freed with free_contig_frozen_range(), + * or by manually calling free_frozen_pages() on each allocated frozen + * non-compound page, for compound frozen pages could be freed with + * free_frozen_pages() directly. * - * Return: pointer to contiguous pages on success, or NULL if not successful. + * Return: pointer to contiguous frozen pages on success, or NULL if not successful. */ -struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +struct page *alloc_contig_frozen_pages_noprof(unsigned long nr_pages, + gfp_t gfp_mask, int nid, nodemask_t *nodemask) { unsigned long ret, pfn, flags; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + bool skip_hugetlb = true; + bool skipped_hugetlb = false; +retry: zonelist = node_zonelist(nid, gfp_mask); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { @@ -7195,16 +7255,20 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, pfn = ALIGN(zone->zone_start_pfn, nr_pages); while (zone_spans_last_pfn(zone, pfn, nr_pages)) { - if (pfn_range_valid_contig(zone, pfn, nr_pages)) { + if (pfn_range_valid_contig(zone, pfn, nr_pages, + skip_hugetlb, + &skipped_hugetlb)) { /* * We release the zone lock here because - * alloc_contig_range() will also lock the zone - * at some point. If there's an allocation - * spinning on this lock, it may win the race - * and cause alloc_contig_range() to fail... + * alloc_contig_frozen_range() will also lock + * the zone at some point. If there's an + * allocation spinning on this lock, it may + * win the race and cause allocation to fail. 
*/ spin_unlock_irqrestore(&zone->lock, flags); - ret = __alloc_contig_pages(pfn, nr_pages, + ret = alloc_contig_frozen_range_noprof(pfn, + pfn + nr_pages, + ACR_FLAGS_NONE, gfp_mask); if (!ret) return pfn_to_page(pfn); @@ -7214,35 +7278,96 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, } spin_unlock_irqrestore(&zone->lock, flags); } + /* + * If we failed, retry the search, but treat regions with HugeTLB pages + * as valid targets. This retains fast-allocations on first pass + * without trying to migrate HugeTLB pages (which may fail). On the + * second pass, we will try moving HugeTLB pages when those pages are + * smaller than the requested contiguous region size. + */ + if (skip_hugetlb && skipped_hugetlb) { + skip_hugetlb = false; + goto retry; + } return NULL; } -#endif /* CONFIG_CONTIG_ALLOC */ +EXPORT_SYMBOL(alloc_contig_frozen_pages_noprof); -void free_contig_range(unsigned long pfn, unsigned long nr_pages) +/** + * alloc_contig_pages() -- tries to find and allocate contiguous range of pages + * @nr_pages: Number of contiguous pages to allocate + * @gfp_mask: GFP mask. + * @nid: Target node + * @nodemask: Mask for other possible nodes + * + * This routine is a wrapper around alloc_contig_frozen_pages(), it can't + * be used to allocate compound pages, the refcount of each allocated page + * will be set to one. + * + * Allocated pages can be freed with free_contig_range() or by manually + * calling __free_page() on each allocated page. + * + * Return: pointer to contiguous pages on success, or NULL if not successful. 
+ */ +struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { - unsigned long count = 0; - struct folio *folio = pfn_folio(pfn); + struct page *page; - if (folio_test_large(folio)) { - int expected = folio_nr_pages(folio); + if (WARN_ON(gfp_mask & __GFP_COMP)) + return NULL; - if (nr_pages == expected) - folio_put(folio); - else - WARN(true, "PFN %lu: nr_pages %lu != expected %d\n", - pfn, nr_pages, expected); + page = alloc_contig_frozen_pages_noprof(nr_pages, gfp_mask, nid, + nodemask); + if (page) + set_pages_refcounted(page, nr_pages); + + return page; +} +EXPORT_SYMBOL(alloc_contig_pages_noprof); + +/** + * free_contig_frozen_range() -- free the contiguous range of frozen pages + * @pfn: start PFN to free + * @nr_pages: Number of contiguous frozen pages to free + * + * This can be used to free the allocated compound/non-compound frozen pages. + */ +void free_contig_frozen_range(unsigned long pfn, unsigned long nr_pages) +{ + struct page *first_page = pfn_to_page(pfn); + const unsigned int order = ilog2(nr_pages); + + if (WARN_ON_ONCE(first_page != compound_head(first_page))) + return; + + if (PageHead(first_page)) { + WARN_ON_ONCE(order != compound_order(first_page)); + free_frozen_pages(first_page, order); return; } - for (; nr_pages--; pfn++) { - struct page *page = pfn_to_page(pfn); + __free_contig_frozen_range(pfn, nr_pages); +} +EXPORT_SYMBOL(free_contig_frozen_range); - count += page_count(page) != 1; - __free_page(page); - } - WARN(count != 0, "%lu pages are still in use!\n", count); +/** + * free_contig_range() -- free the contiguous range of pages + * @pfn: start PFN to free + * @nr_pages: Number of contiguous pages to free + * + * This can be only used to free the allocated non-compound pages. 
+ */ +void free_contig_range(unsigned long pfn, unsigned long nr_pages) +{ + if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn)))) + return; + + for (; nr_pages--; pfn++) + __free_page(pfn_to_page(pfn)); } EXPORT_SYMBOL(free_contig_range); +#endif /* CONFIG_CONTIG_ALLOC */ /* * Effectively disable pcplists for the zone by setting the high limit to 0 @@ -7658,7 +7783,7 @@ struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned * unsafe in NMI. If spin_trylock() is called from hard IRQ the current * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will * mark the task as the owner of another rt_spin_lock which will - * confuse PI logic, so return immediately if called form hard IRQ or + * confuse PI logic, so return immediately if called from hard IRQ or * NMI. * * Note, irqs_disabled() case is ok. This function can be called diff --git a/mm/page_io.c b/mm/page_io.c index 3c342db77ce3..a2c034660c80 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -450,14 +450,14 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); /* - * ->flags can be updated non-atomicially (scan_swap_map_slots), + * ->flags can be updated non-atomically (scan_swap_map_slots), * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ if (data_race(sis->flags & SWP_FS_OPS)) swap_writepage_fs(folio, swap_plug); /* - * ->flags can be updated non-atomicially (scan_swap_map_slots), + * ->flags can be updated non-atomically (scan_swap_map_slots), * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race * is safe. 
*/ diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f72b6cd38b95..c48ff5c00244 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -15,6 +15,100 @@ #define CREATE_TRACE_POINTS #include <trace/events/page_isolation.h> +bool page_is_unmovable(struct zone *zone, struct page *page, + enum pb_isolate_mode mode, unsigned long *step) +{ + /* + * Both, bootmem allocations and memory holes are marked + * PG_reserved and are unmovable. We can even have unmovable + * allocations inside ZONE_MOVABLE, for example when + * specifying "movablecore". + */ + if (PageReserved(page)) + return true; + + /* + * If the zone is movable and we have ruled out all reserved + * pages then it should be reasonably safe to assume the rest + * is movable. + */ + if (zone_idx(zone) == ZONE_MOVABLE) + return false; + + /* + * Hugepages are not in LRU lists, but they're movable. + * THPs are on the LRU, but need to be counted as #small pages. + * We need not scan over tail pages because we don't + * handle each tail page individually in migration. + */ + if (PageHuge(page) || PageCompound(page)) { + struct folio *folio = page_folio(page); + + if (folio_test_hugetlb(folio)) { + struct hstate *h; + + if (!IS_ENABLED(CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION)) + return true; + + /* + * The huge page may be freed so can not + * use folio_hstate() directly. + */ + h = size_to_hstate(folio_size(folio)); + if (h && !hugepage_migration_supported(h)) + return true; + + } else if (!folio_test_lru(folio)) { + return true; + } + + *step = folio_nr_pages(folio) - folio_page_idx(folio, page); + return false; + } + + /* + * We can't use page_count without pin a page + * because another CPU can free compound page. + * This check already skips compound tails of THP + * because their page->_refcount is zero at all time. 
+ */ + if (!page_ref_count(page)) { + if (PageBuddy(page)) + *step = (1 << buddy_order(page)); + return false; + } + + /* + * The HWPoisoned page may be not in buddy system, and + * page_count() is not 0. + */ + if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page)) + return false; + + /* + * We treat all PageOffline() pages as movable when offlining + * to give drivers a chance to decrement their reference count + * in MEM_GOING_OFFLINE in order to indicate that these pages + * can be offlined as there are no direct references anymore. + * For actually unmovable PageOffline() where the driver does + * not support this, we will fail later when trying to actually + * move these pages that still have a reference count > 0. + * (false negatives in this function only) + */ + if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page)) + return false; + + if (PageLRU(page) || page_has_movable_ops(page)) + return false; + + /* + * If there are RECLAIMABLE pages, we need to check + * it. But now, memory offline itself doesn't call + * shrink_node_slabs() and it still to be fixed. + */ + return true; +} + /* * This function checks whether the range [start_pfn, end_pfn) includes * unmovable pages or not. The range must fall into a single pageblock and @@ -35,7 +129,6 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e { struct page *page = pfn_to_page(start_pfn); struct zone *zone = page_zone(page); - unsigned long pfn; VM_BUG_ON(pageblock_start_pfn(start_pfn) != pageblock_start_pfn(end_pfn - 1)); @@ -52,96 +145,14 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e return page; } - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - page = pfn_to_page(pfn); + while (start_pfn < end_pfn) { + unsigned long step = 1; - /* - * Both, bootmem allocations and memory holes are marked - * PG_reserved and are unmovable. 
We can even have unmovable - * allocations inside ZONE_MOVABLE, for example when - * specifying "movablecore". - */ - if (PageReserved(page)) + page = pfn_to_page(start_pfn); + if (page_is_unmovable(zone, page, mode, &step)) return page; - /* - * If the zone is movable and we have ruled out all reserved - * pages then it should be reasonably safe to assume the rest - * is movable. - */ - if (zone_idx(zone) == ZONE_MOVABLE) - continue; - - /* - * Hugepages are not in LRU lists, but they're movable. - * THPs are on the LRU, but need to be counted as #small pages. - * We need not scan over tail pages because we don't - * handle each tail page individually in migration. - */ - if (PageHuge(page) || PageTransCompound(page)) { - struct folio *folio = page_folio(page); - unsigned int skip_pages; - - if (PageHuge(page)) { - struct hstate *h; - - /* - * The huge page may be freed so can not - * use folio_hstate() directly. - */ - h = size_to_hstate(folio_size(folio)); - if (h && !hugepage_migration_supported(h)) - return page; - } else if (!folio_test_lru(folio)) { - return page; - } - - skip_pages = folio_nr_pages(folio) - folio_page_idx(folio, page); - pfn += skip_pages - 1; - continue; - } - - /* - * We can't use page_count without pin a page - * because another CPU can free compound page. - * This check already skips compound tails of THP - * because their page->_refcount is zero at all time. - */ - if (!page_ref_count(page)) { - if (PageBuddy(page)) - pfn += (1 << buddy_order(page)) - 1; - continue; - } - - /* - * The HWPoisoned page may be not in buddy system, and - * page_count() is not 0. - */ - if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page)) - continue; - - /* - * We treat all PageOffline() pages as movable when offlining - * to give drivers a chance to decrement their reference count - * in MEM_GOING_OFFLINE in order to indicate that these pages - * can be offlined as there are no direct references anymore. 
- * For actually unmovable PageOffline() where the driver does - * not support this, we will fail later when trying to actually - * move these pages that still have a reference count > 0. - * (false negatives in this function only) - */ - if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page)) - continue; - - if (PageLRU(page) || page_has_movable_ops(page)) - continue; - - /* - * If there are RECLAIMABLE pages, we need to check - * it. But now, memory offline itself doesn't call - * shrink_node_slabs() and it still to be fixed. - */ - return page; + start_pfn += step; } return NULL; } @@ -301,7 +312,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * pageblock. When not all pageblocks within a page are isolated at the same * time, free page accounting can go wrong. For example, in the case of * MAX_PAGE_ORDER = pageblock_order + 1, a MAX_PAGE_ORDER page has two - * pagelbocks. + * pageblocks. * [ MAX_PAGE_ORDER ] * [ pageblock0 | pageblock1 ] * When either pageblock is isolated, if it is a free page, the page is not diff --git a/mm/page_reporting.c b/mm/page_reporting.c index e4c428e61d8c..8a03effda749 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -123,7 +123,7 @@ page_reporting_drain(struct page_reporting_dev_info *prdev, continue; /* - * If page was not comingled with another page we can + * If page was not commingled with another page we can * consider the result to be "reported" since the page * hasn't been modified, otherwise we will need to * report on the new larger page when we make our way diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 741884645ab0..2708c2b3ac1f 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -145,34 +145,37 @@ void __page_table_check_zero(struct page *page, unsigned int order) rcu_read_unlock(); } -void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte) { if 
(&init_mm == mm) return; - if (pte_user_accessible_page(pte)) { + if (pte_user_accessible_page(pte, addr)) { page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pte_clear); -void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd) { if (&init_mm == mm) return; - if (pmd_user_accessible_page(pmd)) { + if (pmd_user_accessible_page(pmd, addr)) { page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pmd_clear); -void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud) { if (&init_mm == mm) return; - if (pud_user_accessible_page(pud)) { + if (pud_user_accessible_page(pud, addr)) { page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT); } } @@ -196,8 +199,8 @@ static void page_table_check_pte_flags(pte_t pte) } } -void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, - unsigned int nr) +void __page_table_check_ptes_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) { unsigned int i; @@ -207,8 +210,8 @@ void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, page_table_check_pte_flags(pte); for (i = 0; i < nr; i++) - __page_table_check_pte_clear(mm, ptep_get(ptep + i)); - if (pte_user_accessible_page(pte)) + __page_table_check_pte_clear(mm, addr + PAGE_SIZE * i, ptep_get(ptep + i)); + if (pte_user_accessible_page(pte, addr)) page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } EXPORT_SYMBOL(__page_table_check_ptes_set); @@ -225,8 +228,8 @@ static inline void page_table_check_pmd_flags(pmd_t pmd) } } -void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, - unsigned int nr) +void __page_table_check_pmds_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, 
pmd_t pmd, unsigned int nr) { unsigned long stride = PMD_SIZE >> PAGE_SHIFT; unsigned int i; @@ -237,14 +240,14 @@ void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, page_table_check_pmd_flags(pmd); for (i = 0; i < nr; i++) - __page_table_check_pmd_clear(mm, *(pmdp + i)); - if (pmd_user_accessible_page(pmd)) + __page_table_check_pmd_clear(mm, addr + PMD_SIZE * i, *(pmdp + i)); + if (pmd_user_accessible_page(pmd, addr)) page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd)); } EXPORT_SYMBOL(__page_table_check_pmds_set); -void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud, - unsigned int nr) +void __page_table_check_puds_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud, unsigned int nr) { unsigned long stride = PUD_SIZE >> PAGE_SHIFT; unsigned int i; @@ -253,8 +256,8 @@ void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud, return; for (i = 0; i < nr; i++) - __page_table_check_pud_clear(mm, *(pudp + i)); - if (pud_user_accessible_page(pud)) + __page_table_check_pud_clear(mm, addr + PUD_SIZE * i, *(pudp + i)); + if (pud_user_accessible_page(pud, addr)) page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud)); } EXPORT_SYMBOL(__page_table_check_puds_set); @@ -273,7 +276,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm, if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { - __page_table_check_pte_clear(mm, ptep_get(ptep)); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 90cc346a6ecf..a94c401ab2cf 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -313,7 +313,8 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, unsigned long end) { unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); - return boundary < end ? 
boundary : end; + + return min(boundary, end); } static int walk_hugetlb_range(unsigned long addr, unsigned long end, diff --git a/mm/percpu.c b/mm/percpu.c index 81462ce5866e..a2107bdebf0b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1279,12 +1279,16 @@ static int pcpu_free_area(struct pcpu_chunk *chunk, int off) int bit_off, bits, end, oslot, freed; lockdep_assert_held(&pcpu_lock); - pcpu_stats_area_dealloc(chunk); oslot = pcpu_chunk_slot(chunk); bit_off = off / PCPU_MIN_ALLOC_SIZE; + /* check invalid free */ + if (!test_bit(bit_off, chunk->alloc_map) || + !test_bit(bit_off, chunk->bound_map)) + return 0; + /* find end index */ end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), bit_off + 1); @@ -1303,6 +1307,8 @@ static int pcpu_free_area(struct pcpu_chunk *chunk, int off) pcpu_chunk_relocate(chunk, oslot); + pcpu_stats_area_dealloc(chunk); + return freed; } @@ -2242,6 +2248,13 @@ void free_percpu(void __percpu *ptr) spin_lock_irqsave(&pcpu_lock, flags); size = pcpu_free_area(chunk, off); + if (size == 0) { + spin_unlock_irqrestore(&pcpu_lock, flags); + + /* invalid percpu free */ + WARN_ON_ONCE(1); + return; + } pcpu_alloc_tag_free_hook(chunk, off, size); diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c deleted file mode 100644 index 0d9cfbf4fe5d..000000000000 --- a/mm/pt_reclaim.c +++ /dev/null @@ -1,72 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/hugetlb.h> -#include <linux/pgalloc.h> - -#include <asm-generic/tlb.h> - -#include "internal.h" - -bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, - struct zap_details *details) -{ - return details && details->reclaim_pt && (end - start >= PMD_SIZE); -} - -bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval) -{ - spinlock_t *pml = pmd_lockptr(mm, pmd); - - if (!spin_trylock(pml)) - return false; - - *pmdval = pmdp_get_lockless(pmd); - pmd_clear(pmd); - spin_unlock(pml); - - return true; -} - -void free_pte(struct mm_struct *mm, unsigned long 
addr, struct mmu_gather *tlb, - pmd_t pmdval) -{ - pte_free_tlb(tlb, pmd_pgtable(pmdval), addr); - mm_dec_nr_ptes(mm); -} - -void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, - struct mmu_gather *tlb) -{ - pmd_t pmdval; - spinlock_t *pml, *ptl = NULL; - pte_t *start_pte, *pte; - int i; - - pml = pmd_lock(mm, pmd); - start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl); - if (!start_pte) - goto out_ptl; - if (ptl != pml) - spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); - - /* Check if it is empty PTE page */ - for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) { - if (!pte_none(ptep_get(pte))) - goto out_ptl; - } - pte_unmap(start_pte); - - pmd_clear(pmd); - - if (ptl != pml) - spin_unlock(ptl); - spin_unlock(pml); - - free_pte(mm, addr, tlb, pmdval); - - return; -out_ptl: - if (start_pte) - pte_unmap_unlock(start_pte, ptl); - if (ptl != pml) - spin_unlock(pml); -} diff --git a/mm/readahead.c b/mm/readahead.c index f43d03558e62..7b05082c89ea 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -439,7 +439,7 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, * based on I/O request size and the max_readahead. * * The code ramps up the readahead size aggressively at first, but slow down as - * it approaches max_readhead. + * it approaches max_readahead. */ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, diff --git a/mm/rmap.c b/mm/rmap.c index 7b9879ef442d..ab099405151f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1,8 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * mm/rmap.c - physical to virtual reverse mappings * * Copyright 2001, Rik van Riel <riel@conectiva.com.br> - * Released under the General Public License (GPL). * * Simple, low overhead reverse mapping scheme. * Please try to keep this thing as modular as possible. 
@@ -82,6 +82,7 @@ #include <trace/events/migrate.h> #include "internal.h" +#include "swap.h" static struct kmem_cache *anon_vma_cachep; static struct kmem_cache *anon_vma_chain_cachep; @@ -146,14 +147,13 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } -static void anon_vma_chain_link(struct vm_area_struct *vma, - struct anon_vma_chain *avc, - struct anon_vma *anon_vma) +static void anon_vma_chain_assign(struct vm_area_struct *vma, + struct anon_vma_chain *avc, + struct anon_vma *anon_vma) { avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); - anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** @@ -210,7 +210,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma) spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; - anon_vma_chain_link(vma, avc, anon_vma); + anon_vma_chain_assign(vma, avc, anon_vma); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); anon_vma->num_active_vmas++; allocated = NULL; avc = NULL; @@ -231,97 +232,141 @@ int __anon_vma_prepare(struct vm_area_struct *vma) return -ENOMEM; } -/* - * This is a useful helper function for locking the anon_vma root as - * we traverse the vma->anon_vma_chain, looping over anon_vma's that - * have the same vma. - * - * Such anon_vma's should have the same root, so you'd expect to see - * just a single mutex_lock for the whole traversal. - */ -static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) -{ - struct anon_vma *new_root = anon_vma->root; - if (new_root != root) { - if (WARN_ON_ONCE(root)) - up_write(&root->rwsem); - root = new_root; - down_write(&root->rwsem); - } - return root; +static void check_anon_vma_clone(struct vm_area_struct *dst, + struct vm_area_struct *src, + enum vma_operation operation) +{ + /* The write lock must be held. 
*/ + mmap_assert_write_locked(src->vm_mm); + /* If not a fork then must be on same mm. */ + VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm); + + /* If we have anything to do src->anon_vma must be provided. */ + VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain)); + VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma); + /* We are establishing a new anon_vma_chain. */ + VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain)); + /* + * On fork, dst->anon_vma is set NULL (temporarily). Otherwise, anon_vma + * must be the same across dst and src. + */ + VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma); + /* + * Essentially equivalent to above - if not a no-op, we should expect + * dst->anon_vma to be set for everything except a fork. + */ + VM_WARN_ON_ONCE(operation != VMA_OP_FORK && src->anon_vma && + !dst->anon_vma); + /* For the anon_vma to be compatible, it can only be singular. */ + VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED && + !list_is_singular(&src->anon_vma_chain)); +#ifdef CONFIG_PER_VMA_LOCK + /* Only merging an unfaulted VMA leaves the destination attached. */ + VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED && + vma_is_attached(dst)); +#endif } -static inline void unlock_anon_vma_root(struct anon_vma *root) +static void maybe_reuse_anon_vma(struct vm_area_struct *dst, + struct anon_vma *anon_vma) { - if (root) - up_write(&root->rwsem); + /* If already populated, nothing to do.*/ + if (dst->anon_vma) + return; + + /* + * We reuse an anon_vma if any linking VMAs were unmapped and it has + * only a single child at most. + */ + if (anon_vma->num_active_vmas > 0) + return; + if (anon_vma->num_children > 1) + return; + + dst->anon_vma = anon_vma; + anon_vma->num_active_vmas++; } -/* - * Attach the anon_vmas from src to dst. - * Returns 0 on success, -ENOMEM on failure. - * - * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(), - * copy_vma() and anon_vma_fork(). 
The first four want an exact copy of src, - * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to - * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before - * call, we can identify this case by checking (!dst->anon_vma && - * src->anon_vma). - * - * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find - * and reuse existing anon_vma which has no vmas and only one child anon_vma. - * This prevents degradation of anon_vma hierarchy to endless linear chain in - * case of constantly forking task. On the other hand, an anon_vma with more - * than one child isn't reused even if there was no alive vma, thus rmap - * walker has a good chance of avoiding scanning the whole hierarchy when it - * searches where page is mapped. +static void cleanup_partial_anon_vmas(struct vm_area_struct *vma); + +/** + * anon_vma_clone - Establishes new anon_vma_chain objects in @dst linking to + * all of the anon_vma objects contained within @src anon_vma_chain's. + * @dst: The destination VMA with an empty anon_vma_chain. + * @src: The source VMA we wish to duplicate. + * @operation: The type of operation which resulted in the clone. + * + * This is the heart of the VMA side of the anon_vma implementation - we invoke + * this function whenever we need to set up a new VMA's anon_vma state. + * + * This is invoked for: + * + * - VMA Merge, but only when @dst is unfaulted and @src is faulted - meaning we + * clone @src into @dst. + * - VMA split. + * - VMA (m)remap. + * - Fork of faulted VMA. + * + * In all cases other than fork this is simply a duplication. Fork additionally + * adds a new active anon_vma. + * + * ONLY in the case of fork do we try to 'reuse' existing anon_vma's in an + * anon_vma hierarchy, reusing anon_vma's which have no VMA associated with them + * but do have a single child. This is to avoid waste of memory when repeatedly + * forking. + * + * Returns: 0 on success, -ENOMEM on failure. 
*/ -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src, + enum vma_operation operation) { struct anon_vma_chain *avc, *pavc; - struct anon_vma *root = NULL; - - list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { - struct anon_vma *anon_vma; - - avc = anon_vma_chain_alloc(GFP_NOWAIT); - if (unlikely(!avc)) { - unlock_anon_vma_root(root); - root = NULL; - avc = anon_vma_chain_alloc(GFP_KERNEL); - if (!avc) - goto enomem_failure; - } - anon_vma = pavc->anon_vma; - root = lock_anon_vma_root(root, anon_vma); - anon_vma_chain_link(dst, avc, anon_vma); + struct anon_vma *active_anon_vma = src->anon_vma; - /* - * Reuse existing anon_vma if it has no vma and only one - * anon_vma child. - * - * Root anon_vma is never reused: - * it has self-parent reference and at least one child. - */ - if (!dst->anon_vma && src->anon_vma && - anon_vma->num_children < 2 && - anon_vma->num_active_vmas == 0) - dst->anon_vma = anon_vma; + check_anon_vma_clone(dst, src, operation); + + if (!active_anon_vma) + return 0; + + /* + * Allocate AVCs. We don't need an anon_vma lock for this as we + * are not updating the anon_vma rbtree nor are we changing + * anon_vma statistics. + * + * Either src, dst have the same mm for which we hold an exclusive mmap + * write lock, or we are forking and we hold it on src->vm_mm and dst is + * not yet accessible to other threads so there's no possibility of the + * unlinked AVC's being observed yet. + */ + list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) { + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto enomem_failure; + + anon_vma_chain_assign(dst, avc, pavc->anon_vma); } - if (dst->anon_vma) + + /* + * Now link the anon_vma's back to the newly inserted AVCs. + * Note that all anon_vma's share the same root. 
+ */ + anon_vma_lock_write(src->anon_vma); + list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) { + struct anon_vma *anon_vma = avc->anon_vma; + + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); + if (operation == VMA_OP_FORK) + maybe_reuse_anon_vma(dst, anon_vma); + } + + if (operation != VMA_OP_FORK) dst->anon_vma->num_active_vmas++; - unlock_anon_vma_root(root); + + anon_vma_unlock_write(active_anon_vma); return 0; enomem_failure: - /* - * dst->anon_vma is dropped here otherwise its num_active_vmas can - * be incorrectly decremented in unlink_anon_vmas(). - * We can safely do this because callers of anon_vma_clone() don't care - * about dst->anon_vma if anon_vma_clone() failed. - */ - dst->anon_vma = NULL; - unlink_anon_vmas(dst); + cleanup_partial_anon_vmas(dst); return -ENOMEM; } @@ -334,7 +379,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { struct anon_vma_chain *avc; struct anon_vma *anon_vma; - int error; + int rc; /* Don't bother if the parent process has no anon_vma here. */ if (!pvma->anon_vma) @@ -343,27 +388,35 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ vma->anon_vma = NULL; + anon_vma = anon_vma_alloc(); + if (!anon_vma) + return -ENOMEM; + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) { + put_anon_vma(anon_vma); + return -ENOMEM; + } + /* * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ - error = anon_vma_clone(vma, pvma); - if (error) - return error; - - /* An existing anon_vma has been reused, all done then. */ - if (vma->anon_vma) - return 0; + rc = anon_vma_clone(vma, pvma, VMA_OP_FORK); + /* An error arose or an existing anon_vma was reused, all done then. */ + if (rc || vma->anon_vma) { + put_anon_vma(anon_vma); + anon_vma_chain_free(avc); + return rc; + } - /* Then add our own anon_vma. 
*/ - anon_vma = anon_vma_alloc(); - if (!anon_vma) - goto out_error; - anon_vma->num_active_vmas++; - avc = anon_vma_chain_alloc(GFP_KERNEL); - if (!avc) - goto out_error_free_anon_vma; + /* + * OK no reuse, so add our own anon_vma. + * + * Since it is not linked anywhere we can safely manipulate anon_vma + * fields without a lock. + */ + anon_vma->num_active_vmas = 1; /* * The root anon_vma's rwsem is the lock actually used when we * lock any of the anon_vmas in this anon_vma tree. @@ -378,24 +431,59 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; + anon_vma_chain_assign(vma, avc, anon_vma); + /* Now let rmap see it. */ anon_vma_lock_write(anon_vma); - anon_vma_chain_link(vma, avc, anon_vma); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); anon_vma->parent->num_children++; anon_vma_unlock_write(anon_vma); return 0; +} - out_error_free_anon_vma: - put_anon_vma(anon_vma); - out_error: - unlink_anon_vmas(vma); - return -ENOMEM; +/* + * In the unfortunate case of anon_vma_clone() failing to allocate memory we + * have to clean things up. + * + * Since we allocate anon_vma_chain's before we insert them into the interval + * trees, we simply have to free up the AVC's and remove the entries from the + * VMA's anon_vma_chain. + */ +static void cleanup_partial_anon_vmas(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc, *next; + + list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { + list_del(&avc->same_vma); + anon_vma_chain_free(avc); + } } +/** + * unlink_anon_vmas() - remove all links between a VMA and anon_vma's, freeing + * anon_vma_chain objects. + * @vma: The VMA whose links to anon_vma objects is to be severed. 
+ * + * As part of the process anon_vma_chain's are freed, + * anon_vma->num_children,num_active_vmas is updated as required and, if the + * relevant anon_vma references no further VMAs, its reference count is + * decremented. + */ void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; - struct anon_vma *root = NULL; + struct anon_vma *active_anon_vma = vma->anon_vma; + + /* Always hold mmap lock, read-lock on unmap possibly. */ + mmap_assert_locked(vma->vm_mm); + + /* Unfaulted is a no-op. */ + if (!active_anon_vma) { + VM_WARN_ON_ONCE(!list_empty(&vma->anon_vma_chain)); + return; + } + + anon_vma_lock_write(active_anon_vma); /* * Unlink each anon_vma chained to the VMA. This list is ordered @@ -404,7 +492,6 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; - root = lock_anon_vma_root(root, anon_vma); anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* @@ -419,16 +506,15 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_del(&avc->same_vma); anon_vma_chain_free(avc); } - if (vma->anon_vma) { - vma->anon_vma->num_active_vmas--; - /* - * vma would still be needed after unlink, and anon_vma will be prepared - * when handle fault. - */ - vma->anon_vma = NULL; - } - unlock_anon_vma_root(root); + active_anon_vma->num_active_vmas--; + /* + * vma would still be needed after unlink, and anon_vma will be prepared + * when handle fault. 
+ */ + vma->anon_vma = NULL; + anon_vma_unlock_write(active_anon_vma); + /* * Iterate the list once more, it now only contains empty and unlinked @@ -2147,7 +2233,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, goto discard; } - if (swap_duplicate(entry) < 0) { + if (folio_dup_swap(folio, subpage) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } @@ -2158,7 +2244,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * so we'll not check/care. */ if (arch_unmap_one(mm, vma, address, pteval) < 0) { - swap_free(entry); + folio_put_swap(folio, subpage); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } @@ -2166,7 +2252,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* See folio_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { - swap_free(entry); + folio_put_swap(folio, subpage); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } diff --git a/mm/shmem.c b/mm/shmem.c index 063b4c3e4ccb..c40d786a21c6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Resizable virtual memory filesystem for Linux. * @@ -17,8 +18,6 @@ * * tiny-shmem: * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> - * - * This file is released under the GPL. 
*/ #include <linux/fs.h> @@ -983,7 +982,7 @@ static long shmem_free_swap(struct address_space *mapping, xas_unlock_irq(&xas); if (nr_pages) - free_swap_and_cache_nr(radix_to_swp_entry(radswap), nr_pages); + swap_put_entries_direct(radix_to_swp_entry(radswap), nr_pages); return nr_pages; } @@ -1622,11 +1621,23 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, } if (split) { + int order; + try_split: + order = folio_order(folio); /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); if (split_folio_to_list(folio, folio_list)) goto redirty; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order >= HPAGE_PMD_ORDER) { + count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); + count_vm_event(THP_SWPOUT_FALLBACK); + } +#endif + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); + folio_clear_dirty(folio); } @@ -1684,7 +1695,7 @@ try_split: spin_unlock(&shmem_swaplist_lock); } - swap_shmem_alloc(folio->swap, nr_pages); + folio_dup_swap(folio, NULL); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); BUG_ON(folio_mapped(folio)); @@ -1705,7 +1716,7 @@ try_split: /* Swap entry might be erased by racing shmem_free_swap() */ if (!error) { shmem_recalc_inode(inode, 0, -nr_pages); - swap_free_nr(folio->swap, nr_pages); + folio_put_swap(folio, NULL); } /* @@ -2031,10 +2042,9 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); + struct folio *new, *swapcache; int nr_pages = 1 << order; - struct folio *new; gfp_t alloc_gfp; - void *shadow; /* * We have arrived here because our zones are constrained, so don't @@ -2074,34 +2084,19 @@ retry: goto fallback; } - /* - * Prevent parallel swapin from proceeding with the swap cache flag. 
- * - * Of course there is another possible concurrent scenario as well, - * that is to say, the swap cache flag of a large folio has already - * been set by swapcache_prepare(), while another thread may have - * already split the large swap entry stored in the shmem mapping. - * In this case, shmem_add_to_page_cache() will help identify the - * concurrent swapin and return -EEXIST. - */ - if (swapcache_prepare(entry, nr_pages)) { + swapcache = swapin_folio(entry, new); + if (swapcache != new) { folio_put(new); - new = ERR_PTR(-EEXIST); - /* Try smaller folio to avoid cache conflict */ - goto fallback; + if (!swapcache) { + /* + * The new folio is charged already, swapin can + * only fail due to another raced swapin. + */ + new = ERR_PTR(-EEXIST); + goto fallback; + } } - - __folio_set_locked(new); - __folio_set_swapbacked(new); - new->swap = entry; - - memcg1_swapin(entry, nr_pages); - shadow = swap_cache_get_shadow(entry); - if (shadow) - workingset_refault(new, shadow); - folio_add_lru(new); - swap_read_folio(new, NULL); - return new; + return swapcache; fallback: /* Order 0 swapin failed, nothing to fallback to, abort */ if (!order) @@ -2191,8 +2186,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, - struct folio *folio, swp_entry_t swap, - bool skip_swapcache) + struct folio *folio, swp_entry_t swap) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; @@ -2208,15 +2202,14 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); - if (!skip_swapcache) - swap_cache_del_folio(folio); + folio_put_swap(folio, NULL); + swap_cache_del_folio(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode(). 
*/ shmem_recalc_inode(inode, -nr_pages, -nr_pages); - swap_free_nr(swap, nr_pages); } static int shmem_split_large_entry(struct inode *inode, pgoff_t index, @@ -2309,7 +2302,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, softleaf_t index_entry; struct swap_info_struct *si; struct folio *folio = NULL; - bool skip_swapcache = false; int error, nr_pages, order; pgoff_t offset; @@ -2352,7 +2344,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio = NULL; goto failed; } - skip_swapcache = true; } else { /* Cached swapin only supports order 0 folio */ folio = shmem_swapin_cluster(swap, gfp, info, index); @@ -2408,9 +2399,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * and swap cache folios are never partially freed. */ folio_lock(folio); - if ((!skip_swapcache && !folio_test_swapcache(folio)) || - shmem_confirm_swap(mapping, index, swap) < 0 || - folio->swap.val != swap.val) { + if (!folio_matches_swap_entry(folio, swap) || + shmem_confirm_swap(mapping, index, swap) < 0) { error = -EEXIST; goto unlock; } @@ -2442,14 +2432,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (sgp == SGP_WRITE) folio_mark_accessed(folio); - if (skip_swapcache) { - folio->swap.val = 0; - swapcache_clear(si, swap, nr_pages); - } else { - swap_cache_del_folio(folio); - } + folio_put_swap(folio, NULL); + swap_cache_del_folio(folio); folio_mark_dirty(folio); - swap_free_nr(swap, nr_pages); put_swap_device(si); *foliop = folio; @@ -2458,14 +2443,11 @@ failed: if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) - shmem_set_folio_swapin_error(inode, index, folio, swap, - skip_swapcache); + shmem_set_folio_swapin_error(inode, index, folio, swap); unlock: if (folio) folio_unlock(folio); failed_nolock: - if (skip_swapcache) - swapcache_clear(si, folio->swap, folio_nr_pages(folio)); if (folio) folio_put(folio); put_swap_device(si); diff --git a/mm/show_mem.c b/mm/show_mem.c 
index 3a4b5207635d..24078ac3e6bc 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -278,8 +278,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z #endif K(node_page_state(pgdat, NR_PAGETABLE)), K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), - str_yes_no(atomic_read(&pgdat->kswapd_failures) >= - MAX_RECLAIM_RETRIES), + str_yes_no(kswapd_test_hopeless(pgdat)), K(node_page_state(pgdat, NR_BALLOON_PAGES))); } diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 20eaee3e97f7..affa64437302 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -70,7 +70,7 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) memcg_aware ? memcg : NULL, count_per_node); if (total) { - seq_printf(m, "%lu", mem_cgroup_ino(memcg)); + seq_printf(m, "%llu", mem_cgroup_id(memcg)); for_each_node(nid) seq_printf(m, " %lu", count_per_node[nid]); seq_putc(m, '\n'); @@ -106,7 +106,8 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, size_t size, loff_t *pos) { struct shrinker *shrinker = file->private_data; - unsigned long nr_to_scan = 0, ino, read_len; + unsigned long nr_to_scan = 0, read_len; + u64 id; struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; @@ -119,7 +120,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EFAULT; kbuf[read_len] = '\0'; - if (sscanf(kbuf, "%lu %d %lu", &ino, &nid, &nr_to_scan) != 3) + if (sscanf(kbuf, "%llu %d %lu", &id, &nid, &nr_to_scan) != 3) return -EINVAL; if (nid < 0 || nid >= nr_node_ids) @@ -129,15 +130,15 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return size; if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - memcg = mem_cgroup_get_from_ino(ino); - if (!memcg || IS_ERR(memcg)) + memcg = mem_cgroup_get_from_id(id); + if (!memcg) return -ENOENT; if (!mem_cgroup_online(memcg)) { mem_cgroup_put(memcg); return -ENOENT; } - } else if (ino != 0) { + } else if (id != 0) { return -EINVAL; } diff --git a/mm/slub.c b/mm/slub.c index 
18899017512c..42df791279d9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -8334,7 +8334,8 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { - flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); + flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM | WQ_PERCPU, + 0); WARN_ON(!flushwq); } diff --git a/mm/swap.c b/mm/swap.c index 2260dcd2775e..bb19ccbece46 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -513,7 +513,7 @@ void folio_add_lru(struct folio *folio) EXPORT_SYMBOL(folio_add_lru); /** - * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA. + * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA. * @folio: The folio to be added to the LRU. * @vma: VMA in which the folio is mapped. * diff --git a/mm/swap.h b/mm/swap.h index 1bd466da3039..bfafa637c458 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -183,6 +183,33 @@ static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) spin_unlock_irq(&ci->lock); } +/* + * Below are the core routines for doing swap for a folio. + * All helpers require the folio to be locked, and a locked folio + * in the swap cache pins the swap entries / slots allocated to the + * folio, swap relies heavily on the swap cache and folio lock for + * synchronization. + * + * folio_alloc_swap(): the entry point for a folio to be swapped + * out. It allocates swap slots and pins the slots with swap cache. + * The slots start with a swap count of zero. + * + * folio_dup_swap(): increases the swap count of a folio, usually + * during it gets unmapped and a swap entry is installed to replace + * it (e.g., swap entry in page table). A swap slot with swap + * count == 0 should only be increased by this helper. + * + * folio_put_swap(): does the opposite thing of folio_dup_swap(). 
+ */ +int folio_alloc_swap(struct folio *folio); +int folio_dup_swap(struct folio *folio, struct page *subpage); +void folio_put_swap(struct folio *folio, struct page *subpage); + +/* For internal use */ +extern void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset, unsigned int nr_pages); + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; @@ -236,7 +263,7 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, /* * All swap cache helpers below require the caller to ensure the swap entries - * used are valid and stablize the device by any of the following ways: + * used are valid and stabilize the device by any of the following ways: * - Hold a reference by get_swap_device(): this ensures a single entry is * valid and increases the swap device's refcount. * - Locking a folio in the swap cache: this ensures the folio's swap entries @@ -245,11 +272,16 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, * swap entries in the page table, similar to locking swap cache folio. * - See the comment of get_swap_device() for more complex usage. */ +bool swap_cache_has_folio(swp_entry_t entry); struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); -void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow); void swap_cache_del_folio(struct folio *folio); +struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_flags, + struct mempolicy *mpol, pgoff_t ilx, + bool *alloced); /* Below helpers require the caller to lock and pass in the swap cluster. 
*/ +void __swap_cache_add_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry); void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow); void __swap_cache_replace_folio(struct swap_cluster_info *ci, @@ -261,13 +293,11 @@ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); -struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags, - struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, - bool skip_if_exists); struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); +struct folio *swapin_folio(swp_entry_t entry, struct folio *folio); void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); @@ -303,8 +333,6 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { - struct swap_info_struct *si = __swap_entry_to_info(entry); - pgoff_t offset = swp_offset(entry); int i; /* @@ -313,8 +341,9 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) * be in conflict with the folio in swap cache. 
*/ for (i = 0; i < max_nr; i++) { - if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) + if (swap_cache_has_folio(entry)) return i; + entry.val++; } return i; @@ -353,9 +382,24 @@ static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) return NULL; } +static inline int folio_alloc_swap(struct folio *folio) +{ + return -EINVAL; +} + +static inline int folio_dup_swap(struct folio *folio, struct page *page) +{ + return -EINVAL; +} + +static inline void folio_put_swap(struct folio *folio, struct page *page) +{ +} + static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { } + static inline void swap_write_unplug(struct swap_iocb *sio) { } @@ -386,6 +430,11 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } +static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) +{ + return NULL; +} + static inline void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { @@ -397,8 +446,9 @@ static inline int swap_writeout(struct folio *folio, return 0; } -static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) +static inline bool swap_cache_has_folio(swp_entry_t entry) { + return false; } static inline struct folio *swap_cache_get_folio(swp_entry_t entry) @@ -411,10 +461,6 @@ static inline void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } -static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow) -{ -} - static inline void swap_cache_del_folio(struct folio *folio) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 44d228982521..6d0eef7470be 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -81,7 +81,7 @@ void show_swap_cache_info(void) * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. * Return: Returns the found folio on success, NULL otherwise. 
The caller - * must lock nd check if the folio still matches the swap entry before + * must lock and check if the folio still matches the swap entry before * use (e.g., folio_matches_swap_entry). */ struct folio *swap_cache_get_folio(swp_entry_t entry) @@ -103,6 +103,22 @@ struct folio *swap_cache_get_folio(swp_entry_t entry) } /** + * swap_cache_has_folio - Check if a swap slot has cache. + * @entry: swap entry indicating the slot. + * + * Context: Caller must ensure @entry is valid and protect the swap + * device with reference count or locks. + */ +bool swap_cache_has_folio(swp_entry_t entry) +{ + unsigned long swp_tb; + + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + return swp_tb_is_folio(swp_tb); +} + +/** * swap_cache_get_shadow - Looks up a shadow in the swap cache. * @entry: swap entry used for the lookup. * @@ -121,6 +137,34 @@ void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } +void __swap_cache_add_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry) +{ + unsigned long new_tb; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); + + new_tb = folio_to_swp_tb(folio); + ci_start = swp_cluster_offset(entry); + ci_off = ci_start; + ci_end = ci_start + nr_pages; + do { + VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off))); + __swap_table_set(ci, ci_off, new_tb); + } while (++ci_off < ci_end); + + folio_ref_add(folio, nr_pages); + folio_set_swapcache(folio); + folio->swap = entry; + + node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); +} + /** * swap_cache_add_folio - Add a folio into the swap cache. * @folio: The folio to be added. 
@@ -130,43 +174,51 @@ void *swap_cache_get_shadow(swp_entry_t entry) * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. - * The caller also needs to update the corresponding swap_map slots with - * SWAP_HAS_CACHE bit to avoid race or conflict. */ -void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp) +static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, + void **shadowp) { + int err; void *shadow = NULL; - unsigned long old_tb, new_tb; + unsigned long old_tb; + struct swap_info_struct *si; struct swap_cluster_info *ci; - unsigned int ci_start, ci_off, ci_end; + unsigned int ci_start, ci_off, ci_end, offset; unsigned long nr_pages = folio_nr_pages(folio); - VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); - VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); - VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); - - new_tb = folio_to_swp_tb(folio); + si = __swap_entry_to_info(entry); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; - ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + offset = swp_offset(entry); + ci = swap_cluster_lock(si, swp_offset(entry)); + if (unlikely(!ci->table)) { + err = -ENOENT; + goto failed; + } do { - old_tb = __swap_table_xchg(ci, ci_off, new_tb); - WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + old_tb = __swap_table_get(ci, ci_off); + if (unlikely(swp_tb_is_folio(old_tb))) { + err = -EEXIST; + goto failed; + } + if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { + err = -ENOENT; + goto failed; + } if (swp_tb_is_shadow(old_tb)) shadow = swp_tb_to_shadow(old_tb); + offset++; } while (++ci_off < ci_end); - - folio_ref_add(folio, nr_pages); - folio_set_swapcache(folio); - folio->swap = entry; + __swap_cache_add_folio(ci, folio, entry); swap_cluster_unlock(ci); - - node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); - 
lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); - if (shadowp) *shadowp = shadow; + return 0; + +failed: + swap_cluster_unlock(ci); + return err; } /** @@ -185,8 +237,10 @@ void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { + struct swap_info_struct *si; unsigned long old_tb, new_tb; unsigned int ci_start, ci_off, ci_end; + bool folio_swapped = false, need_free = false; unsigned long nr_pages = folio_nr_pages(folio); VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci); @@ -194,6 +248,7 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); + si = __swap_entry_to_info(entry); new_tb = shadow_swp_to_tb(shadow); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; @@ -203,12 +258,27 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, old_tb = __swap_table_xchg(ci, ci_off, new_tb); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != folio); + if (__swap_count(swp_entry(si->type, + swp_offset(entry) + ci_off - ci_start))) + folio_swapped = true; + else + need_free = true; } while (++ci_off < ci_end); folio->swap.val = 0; folio_clear_swapcache(folio); node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); + + if (!folio_swapped) { + swap_entries_free(si, ci, swp_offset(entry), nr_pages); + } else if (need_free) { + do { + if (!__swap_count(entry)) + swap_entries_free(si, ci, swp_offset(entry), 1); + entry.val++; + } while (--nr_pages); + } } /** @@ -230,7 +300,6 @@ void swap_cache_del_folio(struct folio *folio) __swap_cache_del_folio(ci, folio, entry, NULL); swap_cluster_unlock(ci); - put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } @@ 
-283,7 +352,7 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, } /** - * swap_cache_clear_shadow - Clears a set of shadows in the swap cache. + * __swap_cache_clear_shadow - Clears a set of shadows in the swap cache. * @entry: The starting index entry. * @nr_ents: How many slots need to be cleared. * @@ -401,108 +470,143 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, } } -struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, - bool skip_if_exists) +/** + * __swap_cache_prepare_and_add - Prepare the folio and add it to swap cache. + * @entry: swap entry to be bound to the folio. + * @folio: folio to be added. + * @gfp: memory allocation flags for charge, can be 0 if @charged if true. + * @charged: if the folio is already charged. + * + * Update the swap_map and add folio as swap cache, typically before swapin. + * All swap slots covered by the folio must have a non-zero swap count. + * + * Context: Caller must protect the swap device with reference count or locks. + * Return: Returns the folio being added on success. Returns the existing folio + * if @entry is already cached. Returns NULL if raced with swapin or swapoff. + */ +static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, + struct folio *folio, + gfp_t gfp, bool charged) { - struct swap_info_struct *si = __swap_entry_to_info(entry); - struct folio *folio; - struct folio *new_folio = NULL; - struct folio *result = NULL; - void *shadow = NULL; + struct folio *swapcache = NULL; + void *shadow; + int ret; - *new_page_allocated = false; + __folio_set_locked(folio); + __folio_set_swapbacked(folio); for (;;) { - int err; + ret = swap_cache_add_folio(folio, entry, &shadow); + if (!ret) + break; /* - * Check the swap cache first, if a cached folio is found, - * return it unlocked. The caller will lock and check it. 
+ * Large order allocation needs special handling on + * race: if a smaller folio exists in cache, swapin needs + * to fallback to order 0, and doing a swap cache lookup + * might return a folio that is irrelevant to the faulting + * entry because @entry is aligned down. Just return NULL. */ - folio = swap_cache_get_folio(entry); - if (folio) - goto got_folio; + if (ret != -EEXIST || folio_test_large(folio)) + goto failed; - /* - * Just skip read ahead for unused swap slot. - */ - if (!swap_entry_swapped(si, entry)) - goto put_and_return; + swapcache = swap_cache_get_folio(entry); + if (swapcache) + goto failed; + } - /* - * Get a new folio to read into from swap. Allocate it now if - * new_folio not exist, before marking swap_map SWAP_HAS_CACHE, - * when -EEXIST will cause any racers to loop around until we - * add it to cache. - */ - if (!new_folio) { - new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); - if (!new_folio) - goto put_and_return; - } + if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) { + swap_cache_del_folio(folio); + goto failed; + } - /* - * Swap entry may have been freed since our caller observed it. - */ - err = swapcache_prepare(entry, 1); - if (!err) - break; - else if (err != -EEXIST) - goto put_and_return; + memcg1_swapin(entry, folio_nr_pages(folio)); + if (shadow) + workingset_refault(folio, shadow); - /* - * Protect against a recursive call to __read_swap_cache_async() - * on the same entry waiting forever here because SWAP_HAS_CACHE - * is set but the folio is not the swap cache yet. This can - * happen today if mem_cgroup_swapin_charge_folio() below - * triggers reclaim through zswap, which may call - * __read_swap_cache_async() in the writeback path. 
- */ - if (skip_if_exists) - goto put_and_return; + /* Caller will initiate read into locked folio */ + folio_add_lru(folio); + return folio; - /* - * We might race against __swap_cache_del_folio(), and - * stumble across a swap_map entry whose SWAP_HAS_CACHE - * has not yet been cleared. Or race against another - * __read_swap_cache_async(), which has set SWAP_HAS_CACHE - * in swap_map, but not yet added its folio to swap cache. - */ - schedule_timeout_uninterruptible(1); - } +failed: + folio_unlock(folio); + return swapcache; +} - /* - * The swap entry is ours to swap in. Prepare the new folio. - */ - __folio_set_locked(new_folio); - __folio_set_swapbacked(new_folio); +/** + * swap_cache_alloc_folio - Allocate folio for swapped out slot in swap cache. + * @entry: the swapped out swap entry to be binded to the folio. + * @gfp_mask: memory allocation flags + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE + * @new_page_allocated: sets true if allocation happened, false otherwise + * + * Allocate a folio in the swap cache for one swap slot, typically before + * doing IO (e.g. swap in or zswap writeback). The swap slot indicated by + * @entry must have a non-zero swap count (swapped out). + * Currently only supports order 0. + * + * Context: Caller must protect the swap device with reference count or locks. + * Return: Returns the existing folio if @entry is cached already. Returns + * NULL if failed due to -ENOMEM or @entry have a swap count < 1. + */ +struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t ilx, + bool *new_page_allocated) +{ + struct swap_info_struct *si = __swap_entry_to_info(entry); + struct folio *folio; + struct folio *result = NULL; - if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) - goto fail_unlock; + *new_page_allocated = false; + /* Check the swap cache again for readahead path. 
*/ + folio = swap_cache_get_folio(entry); + if (folio) + return folio; - swap_cache_add_folio(new_folio, entry, &shadow); - memcg1_swapin(entry, 1); + /* Skip allocation for unused and bad swap slot for readahead. */ + if (!swap_entry_swapped(si, entry)) + return NULL; - if (shadow) - workingset_refault(new_folio, shadow); - - /* Caller will initiate read into locked new_folio */ - folio_add_lru(new_folio); - *new_page_allocated = true; - folio = new_folio; -got_folio: - result = folio; - goto put_and_return; - -fail_unlock: - put_swap_folio(new_folio, entry); - folio_unlock(new_folio); -put_and_return: - if (!(*new_page_allocated) && new_folio) - folio_put(new_folio); + /* Allocate a new folio to be added into the swap cache. */ + folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); + if (!folio) + return NULL; + /* Try add the new folio, returns existing folio or NULL on failure. */ + result = __swap_cache_prepare_and_add(entry, folio, gfp_mask, false); + if (result == folio) + *new_page_allocated = true; + else + folio_put(folio); return result; } +/** + * swapin_folio - swap-in one or multiple entries skipping readahead. + * @entry: starting swap entry to swap in + * @folio: a new allocated and charged folio + * + * Reads @entry into @folio, @folio will be added to the swap cache. + * If @folio is a large folio, the @entry will be rounded down to align + * with the folio size. + * + * Return: returns pointer to @folio on success. If folio is a large folio + * and this raced with another swapin, NULL will be returned to allow fallback + * to order 0. Else, if another folio was already added to the swap cache, + * return that swap cache folio instead. 
+ */ +struct folio *swapin_folio(swp_entry_t entry, struct folio *folio) +{ + struct folio *swapcache; + pgoff_t offset = swp_offset(entry); + unsigned long nr_pages = folio_nr_pages(folio); + + entry = swp_entry(swp_type(entry), round_down(offset, nr_pages)); + swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true); + if (swapcache == folio) + swap_read_folio(folio, NULL); + return swapcache; +} + /* * Locate a page of swap in physical memory, reserving swap cache space * and reading the disk if it is not already cached. @@ -524,8 +628,8 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return NULL; mpol = get_vma_policy(vma, addr, 0, &ilx); - folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated, false); + folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, + &page_allocated); mpol_cond_put(mpol); if (page_allocated) @@ -642,9 +746,9 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ - folio = __read_swap_cache_async( - swp_entry(swp_type(entry), offset), - gfp_mask, mpol, ilx, &page_allocated, false); + folio = swap_cache_alloc_folio( + swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, + &page_allocated); if (!folio) continue; if (page_allocated) { @@ -661,8 +765,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated, false); + folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, + &page_allocated); if (unlikely(page_allocated)) swap_read_folio(folio, NULL); return folio; @@ -766,8 +870,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, if (!si) continue; } - folio = 
__read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated, false); + folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx, + &page_allocated); if (si) put_swap_device(si); if (!folio) @@ -788,8 +892,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, lru_add_drain(); skip: /* The folio was likely read above, so no need for plugging here */ - folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, - &page_allocated, false); + folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx, + &page_allocated); if (unlikely(page_allocated)) swap_read_folio(folio, NULL); return folio; diff --git a/mm/swapfile.c b/mm/swapfile.c index 25120cf7c480..c2377c4b6bb9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -48,16 +48,18 @@ #include <linux/swap_cgroup.h> #include "swap_table.h" #include "internal.h" +#include "swap_table.h" #include "swap.h" static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); +static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr); +static void swap_put_entry_locked(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset); static bool folio_swapcache_freeable(struct folio *folio); static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, @@ -81,9 +83,7 @@ bool swap_migration_ad_supported; #endif /* CONFIG_MIGRATION */ static const char Bad_file[] = "Bad swap file entry "; -static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; -static const char Unused_offset[] = "Unused swap offset entry "; /* * all 
active swap_info_structs @@ -144,11 +144,6 @@ static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry) return swap_type_to_info(swp_type(entry)); } -static inline unsigned char swap_count(unsigned char ent) -{ - return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ -} - /* * Use the second highest bit of inuse_pages counter as the indicator * if one swap device is on the available plist, so the atomic can @@ -180,39 +175,25 @@ static long swap_usage_in_pages(struct swap_info_struct *si) #define TTRS_FULL 0x4 static bool swap_only_has_cache(struct swap_info_struct *si, - unsigned long offset, int nr_pages) + struct swap_cluster_info *ci, + unsigned long offset, int nr_pages) { + unsigned int ci_off = offset % SWAPFILE_CLUSTER; unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; + unsigned long swp_tb; do { - VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); - if (*map != SWAP_HAS_CACHE) + swp_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb)); + if (*map) return false; + ++ci_off; } while (++map < map_end); return true; } -static bool swap_is_last_map(struct swap_info_struct *si, - unsigned long offset, int nr_pages, bool *has_cache) -{ - unsigned char *map = si->swap_map + offset; - unsigned char *map_end = map + nr_pages; - unsigned char count = *map; - - if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM) - return false; - - while (++map < map_end) { - if (*map != count) - return false; - } - - *has_cache = !!(count & SWAP_HAS_CACHE); - return true; -} - /* * returns number of pages in the folio that backs the swap entry. If positive, * the folio was reclaimed. If negative, the folio was not reclaimed. 
If 0, no @@ -262,12 +243,12 @@ again: goto out_unlock; /* - * It's safe to delete the folio from swap cache only if the folio's - * swap_map is HAS_CACHE only, which means the slots have no page table + * It's safe to delete the folio from swap cache only if the folio + * is in swap cache with swap count == 0. The slots have no page table * reference or pending writeback, and can't be allocated to others. */ ci = swap_cluster_lock(si, offset); - need_reclaim = swap_only_has_cache(si, offset, nr_pages); + need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages); swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; @@ -777,68 +758,84 @@ static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info, return 0; } +/* + * Reclaim drops the ci lock, so the cluster may become unusable (freed or + * stolen by a lower order). @usable will be set to false if that happens. + */ static bool cluster_reclaim_range(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned long start, unsigned long end) + unsigned long start, unsigned int order, + bool *usable) { + unsigned int nr_pages = 1 << order; + unsigned long offset = start, end = start + nr_pages; unsigned char *map = si->swap_map; - unsigned long offset = start; - int nr_reclaim; + unsigned long swp_tb; spin_unlock(&ci->lock); do { - switch (READ_ONCE(map[offset])) { - case 0: - offset++; - break; - case SWAP_HAS_CACHE: - nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); - if (nr_reclaim > 0) - offset += nr_reclaim; - else - goto out; + if (READ_ONCE(map[offset])) break; - default: - goto out; + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); + if (swp_tb_is_folio(swp_tb)) { + if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0) + break; } - } while (offset < end); -out: + } while (++offset < end); spin_lock(&ci->lock); + + /* + * We just dropped ci->lock so cluster could be used by another + * order or got freed, check if it's still usable or empty. 
+ */ + if (!cluster_is_usable(ci, order)) { + *usable = false; + return false; + } + *usable = true; + + /* Fast path, no need to scan if the whole cluster is empty */ + if (cluster_is_empty(ci)) + return true; + /* * Recheck the range no matter reclaim succeeded or not, the slot * could have been be freed while we are not holding the lock. */ - for (offset = start; offset < end; offset++) - if (READ_ONCE(map[offset])) + for (offset = start; offset < end; offset++) { + swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); + if (map[offset] || !swp_tb_is_null(swp_tb)) return false; + } return true; } static bool cluster_scan_range(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned long start, unsigned int nr_pages, + unsigned long offset, unsigned int nr_pages, bool *need_reclaim) { - unsigned long offset, end = start + nr_pages; + unsigned long end = offset + nr_pages; unsigned char *map = si->swap_map; + unsigned long swp_tb; if (cluster_is_empty(ci)) return true; - for (offset = start; offset < end; offset++) { - switch (READ_ONCE(map[offset])) { - case 0: - continue; - case SWAP_HAS_CACHE: + do { + if (map[offset]) + return false; + swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); + if (swp_tb_is_folio(swp_tb)) { if (!vm_swap_full()) return false; *need_reclaim = true; - continue; - default: - return false; + } else { + /* A entry with no count and no cache must be null */ + VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); } - } + } while (++offset < end); return true; } @@ -863,11 +860,13 @@ static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci, } } -static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned int start, unsigned char usage, - unsigned int order) +static bool cluster_alloc_range(struct swap_info_struct *si, + struct swap_cluster_info *ci, + struct folio *folio, + unsigned int offset) { - unsigned int nr_pages = 1 << order; + unsigned long nr_pages; + unsigned 
int order; lockdep_assert_held(&ci->lock); @@ -875,16 +874,38 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster return false; /* + * All mm swap allocation starts with a folio (folio_alloc_swap), + * it's also the only allocation path for large orders allocation. + * Such swap slots starts with count == 0 and will be increased + * upon folio unmap. + * + * Else, it's a exclusive order 0 allocation for hibernation. + * The slot starts with count == 1 and never increases. + */ + if (likely(folio)) { + order = folio_order(folio); + nr_pages = 1 << order; + __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset)); + } else if (IS_ENABLED(CONFIG_HIBERNATION)) { + order = 0; + nr_pages = 1; + WARN_ON_ONCE(si->swap_map[offset]); + si->swap_map[offset] = 1; + swap_cluster_assert_table_empty(ci, offset, 1); + } else { + /* Allocation without folio is only possible with hibernation */ + WARN_ON_ONCE(1); + return false; + } + + /* * The first allocation in a cluster makes the * cluster exclusive to this order */ if (cluster_is_empty(ci)) ci->order = order; - - memset(si->swap_map + start, usage, nr_pages); - swap_cluster_assert_table_empty(ci, start, nr_pages); - swap_range_alloc(si, nr_pages); ci->count += nr_pages; + swap_range_alloc(si, nr_pages); return true; } @@ -892,17 +913,17 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster /* Try use a new cluster for current CPU and allocate from it. */ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned long offset, - unsigned int order, - unsigned char usage) + struct folio *folio, unsigned long offset) { unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); + unsigned int order = likely(folio) ? 
folio_order(folio) : 0; unsigned int nr_pages = 1 << order; - bool need_reclaim, ret; + bool need_reclaim, ret, usable; lockdep_assert_held(&ci->lock); + VM_WARN_ON(!cluster_is_usable(ci, order)); if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER) goto out; @@ -912,14 +933,8 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim)) continue; if (need_reclaim) { - ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages); - /* - * Reclaim drops ci->lock and cluster could be used - * by another order. Not checking flag as off-list - * cluster has no flag set, and change of list - * won't cause fragmentation. - */ - if (!cluster_is_usable(ci, order)) + ret = cluster_reclaim_range(si, ci, offset, order, &usable); + if (!usable) goto out; if (cluster_is_empty(ci)) offset = start; @@ -927,7 +942,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, if (!ret) continue; } - if (!cluster_alloc_range(si, ci, offset, usage, order)) + if (!cluster_alloc_range(si, ci, folio, offset)) break; found = offset; offset += nr_pages; @@ -949,8 +964,7 @@ out: static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, struct list_head *list, - unsigned int order, - unsigned char usage, + struct folio *folio, bool scan_all) { unsigned int found = SWAP_ENTRY_INVALID; @@ -962,7 +976,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, if (!ci) break; offset = cluster_offset(si, ci); - found = alloc_swap_scan_cluster(si, ci, offset, order, usage); + found = alloc_swap_scan_cluster(si, ci, folio, offset); if (found) break; } while (scan_all); @@ -987,7 +1001,8 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) to_scan--; while (offset < end) { - if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { + if (!READ_ONCE(map[offset]) && + swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) { 
spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); @@ -1023,10 +1038,11 @@ static void swap_reclaim_work(struct work_struct *work) * Try to allocate swap entries with specified order and try set a new * cluster for current CPU too. */ -static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, - unsigned char usage) +static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, + struct folio *folio) { struct swap_cluster_info *ci; + unsigned int order = likely(folio) ? folio_order(folio) : 0; unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; /* @@ -1048,8 +1064,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); - found = alloc_swap_scan_cluster(si, ci, offset, - order, usage); + found = alloc_swap_scan_cluster(si, ci, folio, offset); } else { swap_cluster_unlock(ci); } @@ -1063,22 +1078,19 @@ new_cluster: * to spread out the writes. */ if (si->flags & SWP_PAGE_DISCARD) { - found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, - false); + found = alloc_swap_scan_list(si, &si->free_clusters, folio, false); if (found) goto done; } if (order < PMD_ORDER) { - found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], - order, usage, true); + found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], folio, true); if (found) goto done; } if (!(si->flags & SWP_PAGE_DISCARD)) { - found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, - false); + found = alloc_swap_scan_list(si, &si->free_clusters, folio, false); if (found) goto done; } @@ -1092,10 +1104,9 @@ new_cluster: * Scan only one fragment cluster is good enough. Order 0 * allocation will surely success, and large allocation * failure is not critical. Scanning one cluster still - * keeps the list rotated and reclaimed (for HAS_CACHE). 
+ * keeps the list rotated and reclaimed (for clean swap cache). */ - found = alloc_swap_scan_list(si, &si->frag_clusters[order], order, - usage, false); + found = alloc_swap_scan_list(si, &si->frag_clusters[order], folio, false); if (found) goto done; } @@ -1109,13 +1120,11 @@ new_cluster: * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. */ - found = alloc_swap_scan_list(si, &si->frag_clusters[o], - 0, usage, true); + found = alloc_swap_scan_list(si, &si->frag_clusters[o], folio, true); if (found) goto done; - found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], - 0, usage, true); + found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], folio, true); if (found) goto done; } @@ -1306,12 +1315,12 @@ static bool get_swap_device_info(struct swap_info_struct *si) * Fast path try to get swap entries with specified order from current * CPU's swap entry pool (a cluster). */ -static bool swap_alloc_fast(swp_entry_t *entry, - int order) +static bool swap_alloc_fast(struct folio *folio) { + unsigned int order = folio_order(folio); struct swap_cluster_info *ci; struct swap_info_struct *si; - unsigned int offset, found = SWAP_ENTRY_INVALID; + unsigned int offset; /* * Once allocated, swap_info_struct will never be completely freed, @@ -1326,22 +1335,18 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); - found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE); - if (found) - *entry = swp_entry(si->type, found); + alloc_swap_scan_cluster(si, ci, folio, offset); } else { swap_cluster_unlock(ci); } put_swap_device(si); - return !!found; + return folio_test_swapcache(folio); } /* Rotate the device and switch to a new cluster */ -static void swap_alloc_slow(swp_entry_t *entry, - int order) +static void swap_alloc_slow(struct folio *folio) { - unsigned long offset; struct 
swap_info_struct *si, *next; spin_lock(&swap_avail_lock); @@ -1351,13 +1356,11 @@ start_over: plist_requeue(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { - offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE); + cluster_alloc_swap_entry(si, folio); put_swap_device(si); - if (offset) { - *entry = swp_entry(si->type, offset); + if (folio_test_swapcache(folio)) return; - } - if (order) + if (folio_test_large(folio)) return; } @@ -1409,6 +1412,75 @@ start_over: } /** + * swap_put_entries_cluster - Decrease the swap count of a set of slots. + * @si: The swap device. + * @start: start offset of slots. + * @nr: number of slots. + * @reclaim_cache: if true, also reclaim the swap cache. + * + * This helper decreases the swap count of a set of slots and tries to + * batch free them. Also reclaims the swap cache if @reclaim_cache is true. + * Context: The caller must ensure that all slots belong to the same + * cluster and their swap count doesn't go underflow. + */ +static void swap_put_entries_cluster(struct swap_info_struct *si, + unsigned long start, int nr, + bool reclaim_cache) +{ + unsigned long offset = start, end = start + nr; + unsigned long batch_start = SWAP_ENTRY_INVALID; + struct swap_cluster_info *ci; + bool need_reclaim = false; + unsigned int nr_reclaimed; + unsigned long swp_tb; + unsigned int count; + + ci = swap_cluster_lock(si, offset); + do { + swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); + count = si->swap_map[offset]; + VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD); + if (count == 1) { + /* count == 1 and non-cached slots will be batch freed. */ + if (!swp_tb_is_folio(swp_tb)) { + if (!batch_start) + batch_start = offset; + continue; + } + /* count will be 0 after put, slot can be reclaimed */ + need_reclaim = true; + } + /* + * A count != 1 or cached slot can't be freed. Put its swap + * count and then free the interrupted pending batch. 
Cached + * slots will be freed when folio is removed from swap cache + * (__swap_cache_del_folio). + */ + swap_put_entry_locked(si, ci, offset); + if (batch_start) { + swap_entries_free(si, ci, batch_start, offset - batch_start); + batch_start = SWAP_ENTRY_INVALID; + } + } while (++offset < end); + + if (batch_start) + swap_entries_free(si, ci, batch_start, offset - batch_start); + swap_cluster_unlock(ci); + + if (!need_reclaim || !reclaim_cache) + return; + + offset = start; + do { + nr_reclaimed = __try_to_reclaim_swap(si, offset, + TTRS_UNMAPPED | TTRS_FULL); + offset++; + if (nr_reclaimed) + offset = round_up(offset, abs(nr_reclaimed)); + } while (offset < end); +} + +/** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap * @@ -1422,7 +1494,6 @@ int folio_alloc_swap(struct folio *folio) { unsigned int order = folio_order(folio); unsigned int size = 1 << order; - swp_entry_t entry = {}; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); @@ -1447,89 +1518,94 @@ int folio_alloc_swap(struct folio *folio) again: local_lock(&percpu_swap_cluster.lock); - if (!swap_alloc_fast(&entry, order)) - swap_alloc_slow(&entry, order); + if (!swap_alloc_fast(folio)) + swap_alloc_slow(folio); local_unlock(&percpu_swap_cluster.lock); - if (unlikely(!order && !entry.val)) { + if (!order && unlikely(!folio_test_swapcache(folio))) { if (swap_sync_discard()) goto again; } /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */ - if (mem_cgroup_try_charge_swap(folio, entry)) - goto out_free; + if (unlikely(mem_cgroup_try_charge_swap(folio, folio->swap))) + swap_cache_del_folio(folio); - if (!entry.val) + if (unlikely(!folio_test_swapcache(folio))) return -ENOMEM; - swap_cache_add_folio(folio, entry, NULL); - return 0; +} + +/** + * folio_dup_swap() - Increase swap count of swap entries of a folio. + * @folio: folio with swap entries bounded. 
+ * @subpage: if not NULL, only increase the swap count of this subpage. + * + * Typically called when the folio is unmapped and has its swap entry + * take its place. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * NOTE: The caller also has to ensure there is no raced call to + * swap_put_entries_direct on its swap entry before this helper returns, or + * the swap map may underflow. Currently, we only accept @subpage == NULL + * for shmem due to the limitation of swap continuation: shmem always + * duplicates the swap entry only once, so there is no such issue for it. + */ +int folio_dup_swap(struct folio *folio, struct page *subpage) +{ + int err = 0; + swp_entry_t entry = folio->swap; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio); + + if (subpage) { + entry.val += folio_page_idx(folio, subpage); + nr_pages = 1; + } + + while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM) + err = add_swap_count_continuation(entry, GFP_ATOMIC); -out_free: - put_swap_folio(folio, entry); - return -ENOMEM; + return err; } -static struct swap_info_struct *_swap_info_get(swp_entry_t entry) +/** + * folio_put_swap() - Decrease swap count of swap entries of a folio. + * @folio: folio with swap entries bounded, must be in swap cache and locked. + * @subpage: if not NULL, only decrease the swap count of this subpage. + * + * This won't free the swap slots even if swap count drops to zero, they are + * still pinned by the swap cache. User may call folio_free_swap to free them. + * Context: Caller must ensure the folio is locked and in the swap cache.
+ */ +void folio_put_swap(struct folio *folio, struct page *subpage) { - struct swap_info_struct *si; - unsigned long offset; + swp_entry_t entry = folio->swap; + unsigned long nr_pages = folio_nr_pages(folio); + struct swap_info_struct *si = __swap_entry_to_info(entry); - if (!entry.val) - goto out; - si = swap_entry_to_info(entry); - if (!si) - goto bad_nofile; - if (data_race(!(si->flags & SWP_USED))) - goto bad_device; - offset = swp_offset(entry); - if (offset >= si->max) - goto bad_offset; - if (data_race(!si->swap_map[swp_offset(entry)])) - goto bad_free; - return si; + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio); -bad_free: - pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); - goto out; -bad_offset: - pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); - goto out; -bad_device: - pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val); - goto out; -bad_nofile: - pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); -out: - return NULL; + if (subpage) { + entry.val += folio_page_idx(folio, subpage); + nr_pages = 1; + } + + swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false); } -static unsigned char swap_entry_put_locked(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, - unsigned char usage) +static void swap_put_entry_locked(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset) { - unsigned long offset = swp_offset(entry); unsigned char count; - unsigned char has_cache; count = si->swap_map[offset]; - - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; - - if (usage == SWAP_HAS_CACHE) { - VM_BUG_ON(!has_cache); - has_cache = 0; - } else if (count == SWAP_MAP_SHMEM) { - /* - * Or we could insist on shmem.c using a special - * swap_shmem_free() and free_shmem_swap_and_cache()... 
- */ - count = 0; - } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { + if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { if (count == COUNT_CONTINUED) { if (swap_count_continued(si, offset, count)) count = SWAP_MAP_MAX | COUNT_CONTINUED; @@ -1539,13 +1615,9 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si, count--; } - usage = count | has_cache; - if (usage) - WRITE_ONCE(si->swap_map[offset], usage); - else - swap_entries_free(si, ci, entry, 1); - - return usage; + WRITE_ONCE(si->swap_map[offset], count); + if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) + swap_entries_free(si, ci, offset, 1); } /* @@ -1574,10 +1646,9 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si, * CPU1 CPU2 * do_swap_page() * ... swapoff+swapon - * __read_swap_cache_async() - * swapcache_prepare() - * __swap_duplicate() - * // check swap_map + * swap_cache_alloc_folio() + * swap_cache_add_folio() + * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before @@ -1614,105 +1685,15 @@ put_out: return NULL; } -static void swap_entries_put_cache(struct swap_info_struct *si, - swp_entry_t entry, int nr) -{ - unsigned long offset = swp_offset(entry); - struct swap_cluster_info *ci; - - ci = swap_cluster_lock(si, offset); - if (swap_only_has_cache(si, offset, nr)) { - swap_entries_free(si, ci, entry, nr); - } else { - for (int i = 0; i < nr; i++, entry.val++) - swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); - } - swap_cluster_unlock(ci); -} - -static bool swap_entries_put_map(struct swap_info_struct *si, - swp_entry_t entry, int nr) -{ - unsigned long offset = swp_offset(entry); - struct swap_cluster_info *ci; - bool has_cache = false; - unsigned char count; - int i; - - if (nr <= 1) - goto fallback; - count = swap_count(data_race(si->swap_map[offset])); - if (count != 1 && count != SWAP_MAP_SHMEM) - goto fallback; - - ci = swap_cluster_lock(si, 
offset); - if (!swap_is_last_map(si, offset, nr, &has_cache)) { - goto locked_fallback; - } - if (!has_cache) - swap_entries_free(si, ci, entry, nr); - else - for (i = 0; i < nr; i++) - WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - swap_cluster_unlock(ci); - - return has_cache; - -fallback: - ci = swap_cluster_lock(si, offset); -locked_fallback: - for (i = 0; i < nr; i++, entry.val++) { - count = swap_entry_put_locked(si, ci, entry, 1); - if (count == SWAP_HAS_CACHE) - has_cache = true; - } - swap_cluster_unlock(ci); - return has_cache; -} - -/* - * Only functions with "_nr" suffix are able to free entries spanning - * cross multi clusters, so ensure the range is within a single cluster - * when freeing entries with functions without "_nr" suffix. - */ -static bool swap_entries_put_map_nr(struct swap_info_struct *si, - swp_entry_t entry, int nr) -{ - int cluster_nr, cluster_rest; - unsigned long offset = swp_offset(entry); - bool has_cache = false; - - cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER; - while (nr) { - cluster_nr = min(nr, cluster_rest); - has_cache |= swap_entries_put_map(si, entry, cluster_nr); - cluster_rest = SWAPFILE_CLUSTER; - nr -= cluster_nr; - entry.val += cluster_nr; - } - - return has_cache; -} - -/* - * Check if it's the last ref of swap entry in the freeing path. - * Qualified value includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. - */ -static inline bool __maybe_unused swap_is_last_ref(unsigned char count) -{ - return (count == SWAP_HAS_CACHE) || (count == 1) || - (count == SWAP_MAP_SHMEM); -} - /* * Drop the last ref of swap entries, caller have to ensure all entries * belong to the same cgroup and cluster. 
*/ -static void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages) +void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset, unsigned int nr_pages) { - unsigned long offset = swp_offset(entry); + swp_entry_t entry = swp_entry(si->type, offset); unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; @@ -1723,7 +1704,7 @@ static void swap_entries_free(struct swap_info_struct *si, ci->count -= nr_pages; do { - VM_BUG_ON(!swap_is_last_ref(*map)); + VM_WARN_ON(*map > 1); *map = 0; } while (++map < map_end); @@ -1737,55 +1718,18 @@ static void swap_entries_free(struct swap_info_struct *si, partial_free_cluster(si, ci); } -/* - * Caller has made sure that the swap device corresponding to entry - * is still around or has not been recycled. - */ -void swap_free_nr(swp_entry_t entry, int nr_pages) -{ - int nr; - struct swap_info_struct *sis; - unsigned long offset = swp_offset(entry); - - sis = _swap_info_get(entry); - if (!sis) - return; - - while (nr_pages) { - nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - swap_entries_put_map(sis, swp_entry(sis->type, offset), nr); - offset += nr; - nr_pages -= nr; - } -} - -/* - * Called after dropping swapcache to decrease refcnt to swap entries. - */ -void put_swap_folio(struct folio *folio, swp_entry_t entry) -{ - struct swap_info_struct *si; - int size = 1 << swap_entry_order(folio_order(folio)); - - si = _swap_info_get(entry); - if (!si) - return; - - swap_entries_put_cache(si, entry, size); -} - int __swap_count(swp_entry_t entry) { struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); - return swap_count(si->swap_map[offset]); + return si->swap_map[offset]; } -/* - * How many references to @entry are currently swapped out? 
- * This does not give an exact answer when swap count is continued, - * but does include the high COUNT_CONTINUED flag to allow for that. +/** + * swap_entry_swapped - Check if the swap entry is swapped. + * @si: the swap device. + * @entry: the swap entry. */ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { @@ -1794,9 +1738,10 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) int count; ci = swap_cluster_lock(si, offset); - count = swap_count(si->swap_map[offset]); + count = si->swap_map[offset]; swap_cluster_unlock(ci); - return !!count; + + return count && count != SWAP_MAP_BAD; } /* @@ -1812,7 +1757,7 @@ int swp_swapcount(swp_entry_t entry) pgoff_t offset; unsigned char *map; - si = _swap_info_get(entry); + si = get_swap_device(entry); if (!si) return 0; @@ -1820,7 +1765,7 @@ int swp_swapcount(swp_entry_t entry) ci = swap_cluster_lock(si, offset); - count = swap_count(si->swap_map[offset]); + count = si->swap_map[offset]; if (!(count & COUNT_CONTINUED)) goto out; @@ -1842,6 +1787,7 @@ int swp_swapcount(swp_entry_t entry) } while (tmp_count & COUNT_CONTINUED); out: swap_cluster_unlock(ci); + put_swap_device(si); return count; } @@ -1858,12 +1804,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { - if (swap_count(map[roffset])) + if (map[roffset]) ret = true; goto unlock_out; } for (i = 0; i < nr_pages; i++) { - if (swap_count(map[offset + i])) { + if (map[offset + i]) { ret = true; break; } @@ -1876,11 +1822,12 @@ unlock_out: static bool folio_swapped(struct folio *folio) { swp_entry_t entry = folio->swap; - struct swap_info_struct *si = _swap_info_get(entry); + struct swap_info_struct *si; - if (!si) - return false; + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + si = __swap_entry_to_info(entry); if (!IS_ENABLED(CONFIG_THP_SWAP) || 
likely(!folio_test_large(folio))) return swap_entry_swapped(si, entry); @@ -1939,73 +1886,45 @@ bool folio_free_swap(struct folio *folio) } /** - * free_swap_and_cache_nr() - Release reference on range of swap entries and - * reclaim their cache if no more references remain. + * swap_put_entries_direct() - Release reference on range of swap entries and + * reclaim their cache if no more references remain. * @entry: First entry of range. * @nr: Number of entries in range. * * For each swap entry in the contiguous range, release a reference. If any swap * entries become free, try to reclaim their underlying folios, if present. The * offset range is defined by [entry.offset, entry.offset + nr). + * + * Context: Caller must ensure there is no race condition on the reference + * owner. e.g., locking the PTL of a PTE containing the entry being released. */ -void free_swap_and_cache_nr(swp_entry_t entry, int nr) +void swap_put_entries_direct(swp_entry_t entry, int nr) { const unsigned long start_offset = swp_offset(entry); const unsigned long end_offset = start_offset + nr; + unsigned long offset, cluster_end; struct swap_info_struct *si; - bool any_only_cache = false; - unsigned long offset; si = get_swap_device(entry); - if (!si) + if (WARN_ON_ONCE(!si)) return; - - if (WARN_ON(end_offset > si->max)) - goto out; - - /* - * First free all entries in the range. - */ - any_only_cache = swap_entries_put_map_nr(si, entry, nr); - - /* - * Short-circuit the below loop if none of the entries had their - * reference drop to zero. - */ - if (!any_only_cache) + if (WARN_ON_ONCE(end_offset > si->max)) goto out; - /* - * Now go back over the range trying to reclaim the swap cache. - */ - for (offset = start_offset; offset < end_offset; offset += nr) { - nr = 1; - if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { - /* - * Folios are always naturally aligned in swap so - * advance forward to the next boundary. 
Zero means no - * folio was found for the swap entry, so advance by 1 - * in this case. Negative value means folio was found - * but could not be reclaimed. Here we can still advance - * to the next boundary. - */ - nr = __try_to_reclaim_swap(si, offset, - TTRS_UNMAPPED | TTRS_FULL); - if (nr == 0) - nr = 1; - else if (nr < 0) - nr = -nr; - nr = ALIGN(offset + 1, nr) - offset; - } - } - + /* Put entries and reclaim cache in each cluster */ + offset = start_offset; + do { + cluster_end = min(round_up(offset + 1, SWAPFILE_CLUSTER), end_offset); + swap_put_entries_cluster(si, offset, cluster_end - offset, true); + offset = cluster_end; + } while (offset < end_offset); out: put_swap_device(si); } #ifdef CONFIG_HIBERNATION - -swp_entry_t get_swap_page_of_type(int type) +/* Allocate a slot for hibernation */ +swp_entry_t swap_alloc_hibernation_slot(int type) { struct swap_info_struct *si = swap_type_to_info(type); unsigned long offset; @@ -2018,11 +1937,11 @@ swp_entry_t get_swap_page_of_type(int type) if (get_swap_device_info(si)) { if (si->flags & SWP_WRITEOK) { /* - * Grab the local lock to be complaint + * Grab the local lock to be compliant * with swap table allocation. 
*/ local_lock(&percpu_swap_cluster.lock); - offset = cluster_alloc_swap_entry(si, 0, 1); + offset = cluster_alloc_swap_entry(si, NULL); local_unlock(&percpu_swap_cluster.lock); if (offset) entry = swp_entry(si->type, offset); @@ -2033,6 +1952,26 @@ fail: return entry; } +/* Free a slot allocated by swap_alloc_hibernation_slot */ +void swap_free_hibernation_slot(swp_entry_t entry) +{ + struct swap_info_struct *si; + struct swap_cluster_info *ci; + pgoff_t offset = swp_offset(entry); + + si = get_swap_device(entry); + if (WARN_ON(!si)) + return; + + ci = swap_cluster_lock(si, offset); + swap_put_entry_locked(si, ci, offset); + swap_cluster_unlock(ci); + + /* In theory readahead might add it to the swap cache by accident */ + __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + put_swap_device(si); +} + /* * Find the swap type that corresponds to given device (if any). * @@ -2194,7 +2133,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry - * so this must be called before swap_free(). + * so this must be called before folio_put_swap(). 
*/ arch_swap_restore(folio_swap(entry, folio), folio); @@ -2235,7 +2174,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); - swap_free(entry); + folio_put_swap(swapcache, folio_file_page(swapcache, swp_offset(entry))); out: if (pte) pte_unmap_unlock(pte, ptl); @@ -2430,6 +2369,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev) { unsigned int i; + unsigned long swp_tb; unsigned char count; /* @@ -2440,7 +2380,11 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, */ for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); - if (count && swap_count(count) != SWAP_MAP_BAD) + swp_tb = swap_table_get(__swap_offset_to_cluster(si, i), + i % SWAPFILE_CLUSTER); + if (count == SWAP_MAP_BAD) + continue; + if (count || swp_tb_is_folio(swp_tb)) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); @@ -3650,67 +3594,39 @@ void si_swapinfo(struct sysinfo *val) * Returns error code in following case. * - success -> 0 * - swp_entry is invalid -> EINVAL - * - swap-cache reference is requested but there is already one. -> EEXIST - * - swap-cache reference is requested but the entry is not used. -> ENOENT + * - swap-mapped reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. 
-> ENOMEM */ -static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) +static int swap_dup_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset, + unsigned char usage, int nr) { - struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long offset; + int i; unsigned char count; - unsigned char has_cache; - int err, i; - - si = swap_entry_to_info(entry); - if (WARN_ON_ONCE(!si)) { - pr_err("%s%08lx\n", Bad_file, entry.val); - return -EINVAL; - } - - offset = swp_offset(entry); - VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - VM_WARN_ON(usage == 1 && nr > 1); - ci = swap_cluster_lock(si, offset); - err = 0; for (i = 0; i < nr; i++) { count = si->swap_map[offset + i]; - /* - * swapin_readahead() doesn't check if a swap entry is valid, so the - * swap entry could be SWAP_MAP_BAD. Check here with lock held. + * For swapout, the allocator never allocates bad slots. For + * swapin, readahead is guarded by swap_entry_swapped. */ - if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { - err = -ENOENT; - goto unlock_out; - } - - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; - - if (!count && !has_cache) { - err = -ENOENT; - } else if (usage == SWAP_HAS_CACHE) { - if (has_cache) - err = -EEXIST; - } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) { - err = -EINVAL; - } - - if (err) - goto unlock_out; + if (WARN_ON(count == SWAP_MAP_BAD)) + return -ENOENT; + /* + * Swap count duplication must be guarded by either swap cache folio (from + * folio_dup_swap) or external lock of existing entry (from swap_dup_entry_direct).
+ */ + if (WARN_ON(!count && + !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))) + return -ENOENT; + if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)) + return -EINVAL; } for (i = 0; i < nr; i++) { count = si->swap_map[offset + i]; - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; - - if (usage == SWAP_HAS_CACHE) - has_cache = SWAP_HAS_CACHE; - else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) + if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) count += usage; else if (swap_count_continued(si, offset + i, count)) count = COUNT_CONTINUED; @@ -3719,66 +3635,56 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) * Don't need to rollback changes, because if * usage == 1, there must be nr == 1. */ - err = -ENOMEM; - goto unlock_out; + return -ENOMEM; } - WRITE_ONCE(si->swap_map[offset + i], count | has_cache); + WRITE_ONCE(si->swap_map[offset + i], count); } -unlock_out: - swap_cluster_unlock(ci); - return err; + return 0; } -/* - * Help swapoff by noting that swap entry belongs to shmem/tmpfs - * (in which case its reference count is never incremented). - */ -void swap_shmem_alloc(swp_entry_t entry, int nr) +static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) { - __swap_duplicate(entry, SWAP_MAP_SHMEM, nr); + int err; + struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + si = swap_entry_to_info(entry); + if (WARN_ON_ONCE(!si)) { + pr_err("%s%08lx\n", Bad_file, entry.val); + return -EINVAL; + } + + VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); + ci = swap_cluster_lock(si, offset); + err = swap_dup_entries(si, ci, offset, usage, nr); + swap_cluster_unlock(ci); + return err; } /* - * Increase reference count of swap entry by 1. + * swap_dup_entry_direct() - Increase reference count of a swap entry by one. + * @entry: first swap entry from which we want to increase the refcount. 
+ * * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required * but could not be atomically allocated. Returns 0, just as if it succeeded, * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which * might occur if a page table entry has got corrupted. + * + * Context: Caller must ensure there is no race condition on the reference + * owner. e.g., locking the PTL of a PTE containing the entry being increased. */ -int swap_duplicate(swp_entry_t entry) +int swap_dup_entry_direct(swp_entry_t entry) { int err = 0; - while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } /* - * @entry: first swap entry from which we allocate nr swap cache. - * - * Called when allocating swap cache for existing swap entries, - * This can return error codes. Returns 0 at success. - * -EEXIST means there is a swap cache. - * Note: return code is different from swap_duplicate(). - */ -int swapcache_prepare(swp_entry_t entry, int nr) -{ - return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); -} - -/* - * Caller should ensure entries belong to the same folio so - * the entries won't span cross cluster boundary. 
- */ -void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) -{ - swap_entries_put_cache(si, entry, nr); -} - -/* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's * page of the original vmalloc'ed swap_map, to hold the continuation count @@ -3823,7 +3729,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) ci = swap_cluster_lock(si, offset); - count = swap_count(si->swap_map[offset]); + count = si->swap_map[offset]; if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { /* @@ -3895,7 +3801,7 @@ outer: * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of swap_entry_put_locked() + * Called while __swap_duplicate() or caller of swap_put_entry_locked() * holds cluster lock. 
*/ static bool swap_count_continued(struct swap_info_struct *si, diff --git a/mm/tests/lazy_mmu_mode_kunit.c b/mm/tests/lazy_mmu_mode_kunit.c new file mode 100644 index 000000000000..b689241c6bef --- /dev/null +++ b/mm/tests/lazy_mmu_mode_kunit.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <kunit/test.h> +#include <linux/pgtable.h> + +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); + +static void expect_not_active(struct kunit *test) +{ + KUNIT_EXPECT_FALSE(test, is_lazy_mmu_mode_active()); +} + +static void expect_active(struct kunit *test) +{ + KUNIT_EXPECT_TRUE(test, is_lazy_mmu_mode_active()); +} + +static void lazy_mmu_mode_active(struct kunit *test) +{ + expect_not_active(test); + + lazy_mmu_mode_enable(); + expect_active(test); + + { + /* Nested section */ + lazy_mmu_mode_enable(); + expect_active(test); + + lazy_mmu_mode_disable(); + expect_active(test); + } + + { + /* Paused section */ + lazy_mmu_mode_pause(); + expect_not_active(test); + + { + /* No effect (paused) */ + lazy_mmu_mode_enable(); + expect_not_active(test); + + lazy_mmu_mode_disable(); + expect_not_active(test); + + lazy_mmu_mode_pause(); + expect_not_active(test); + + lazy_mmu_mode_resume(); + expect_not_active(test); + } + + lazy_mmu_mode_resume(); + expect_active(test); + } + + lazy_mmu_mode_disable(); + expect_not_active(test); +} + +static struct kunit_case lazy_mmu_mode_test_cases[] = { + KUNIT_CASE(lazy_mmu_mode_active), + {} +}; + +static struct kunit_suite lazy_mmu_mode_test_suite = { + .name = "lazy_mmu_mode", + .test_cases = lazy_mmu_mode_test_cases, +}; +kunit_test_suite(lazy_mmu_mode_test_suite); + +MODULE_DESCRIPTION("Tests for the lazy MMU mode"); +MODULE_LICENSE("GPL"); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e6dfd5f28acd..927086bb4a3c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1103,7 +1103,7 @@ static long move_present_ptes(struct mm_struct *mm, /* It's safe to drop the reference now as the page-table is holding one. 
*/ folio_put(*first_src_folio); *first_src_folio = NULL; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); while (true) { orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); @@ -1140,7 +1140,7 @@ static long move_present_ptes(struct mm_struct *mm, break; } - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); if (src_addr > src_start) flush_tlb_range(src_vma, src_start, src_addr); @@ -1190,17 +1190,13 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, * Check if the swap entry is cached after acquiring the src_pte * lock. Otherwise, we might miss a newly loaded swap cache folio. * - * Check swap_map directly to minimize overhead, READ_ONCE is sufficient. * We are trying to catch newly added swap cache, the only possible case is * when a folio is swapped in and out again staying in swap cache, using the * same entry before the PTE check above. The PTL is acquired and released - * twice, each time after updating the swap_map's flag. So holding - * the PTL here ensures we see the updated value. False positive is possible, - * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the - * cache, or during the tiny synchronization window between swap cache and - * swap_map, but it will be gone very quickly, worst result is retry jitters. + * twice, each time after updating the swap table. So holding + * the PTL here ensures we see the updated value. */ - if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) { + if (swap_cache_has_folio(entry)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } @@ -1274,7 +1270,7 @@ retry: * Use the maywrite version to indicate that dst_pte will be modified, * since dst_pte needs to be none, the subsequent pte_same() check * cannot prevent the dst_pte page from being freed concurrently, so we - * also need to abtain dst_pmdval and recheck pmd_same() later. + * also need to obtain dst_pmdval and recheck pmd_same() later. 
*/ dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, &dst_ptl); @@ -1330,7 +1326,7 @@ retry: goto out; } - /* If PTE changed after we locked the folio them start over */ + /* If PTE changed after we locked the folio then start over */ if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { ret = -EAGAIN; goto out; @@ -381,7 +381,7 @@ again: fput(vp->file); } if (vp->remove->anon_vma) - anon_vma_merge(vp->vma, vp->remove); + unlink_anon_vmas(vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) @@ -530,7 +530,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (err) goto out_free_vmi; - err = anon_vma_clone(new, vma); + err = anon_vma_clone(new, vma, VMA_OP_SPLIT); if (err) goto out_free_mpol; @@ -628,7 +628,7 @@ static int dup_anon_vma(struct vm_area_struct *dst, vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; - ret = anon_vma_clone(dst, src); + ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED); if (ret) return ret; @@ -1901,7 +1901,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - if (anon_vma_clone(new_vma, vma)) + if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP)) goto out_free_mempol; if (new_vma->vm_file) get_file(new_vma->vm_file); @@ -2951,10 +2951,10 @@ retry: return -ENOMEM; /* - * Adjust for the gap first so it doesn't interfere with the - * later alignment. The first step is the minimum needed to - * fulill the start gap, the next steps is the minimum to align - * that. It is the minimum needed to fulill both. + * Adjust for the gap first so it doesn't interfere with the later + * alignment. The first step is the minimum needed to fulfill the start + * gap, the next step is the minimum to align that. It is the minimum + * needed to fulfill both. 
*/ gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; @@ -267,7 +267,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); /** - * vma_modify_flags() - Peform any necessary split/merge in preparation for + * vma_modify_flags() - Perform any necessary split/merge in preparation for * setting VMA flags to *@vm_flags in the range @start to @end contained within * @vma. * @vmi: Valid VMA iterator positioned at @vma. @@ -295,7 +295,7 @@ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, vm_flags_t *vm_flags_ptr); /** - * vma_modify_name() - Peform any necessary split/merge in preparation for + * vma_modify_name() - Perform any necessary split/merge in preparation for * setting anonymous VMA name to @new_name in the range @start to @end contained * within @vma. * @vmi: Valid VMA iterator positioned at @vma. @@ -319,7 +319,7 @@ __must_check struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi, struct anon_vma_name *new_name); /** - * vma_modify_policy() - Peform any necessary split/merge in preparation for + * vma_modify_policy() - Perform any necessary split/merge in preparation for * setting NUMA policy to @new_pol in the range @start to @end contained * within @vma. * @vmi: Valid VMA iterator positioned at @vma. @@ -343,7 +343,7 @@ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct mempolicy *new_pol); /** - * vma_modify_flags_uffd() - Peform any necessary split/merge in preparation for + * vma_modify_flags_uffd() - Perform any necessary split/merge in preparation for * setting VMA flags to @vm_flags and UFFD context to @new_ctx in the range * @start to @end contained within @vma. * @vmi: Valid VMA iterator positioned at @vma. 
@@ -561,12 +561,6 @@ static inline unsigned long vma_iter_end(struct vma_iterator *vmi) return vmi->mas.last + 1; } -static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, - unsigned long count) -{ - return mas_expected_entries(&vmi->mas, count); -} - static inline struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e286c2d2068c..03e1117480d5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -108,7 +108,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { if (unlikely(!pte_none(ptep_get(pte)))) { @@ -134,7 +134,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pfn++; } while (pte += PFN_DOWN(size), addr += size, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -305,6 +305,11 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end, int err; pgtbl_mod_mask mask = 0; + /* + * Might allocate pagetables (for most archs a more precise annotation + * would be might_alloc(GFP_PGTABLE_KERNEL)). Also might shootdown TLB + * (requires IRQs enabled on x86). 
+ */ might_sleep(); BUG_ON(addr >= end); @@ -366,7 +371,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long size = PAGE_SIZE; pte = pte_offset_kernel(pmd, addr); - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { #ifdef CONFIG_HUGETLB_PAGE @@ -385,7 +390,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; } @@ -533,7 +538,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, if (!pte) return -ENOMEM; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { struct page *page = pages[*nr]; @@ -555,7 +560,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); *mask |= PGTBL_PTE_MODIFIED; return err; @@ -2268,11 +2273,14 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) reclaim_list_global(&decay_list); } +#define KASAN_RELEASE_BATCH_SIZE 32 + static void kasan_release_vmalloc_node(struct vmap_node *vn) { struct vmap_area *va; unsigned long start, end; + unsigned int batch_count = 0; start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start; end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end; @@ -2282,6 +2290,11 @@ kasan_release_vmalloc_node(struct vmap_node *vn) kasan_release_vmalloc(va->va_start, va->va_end, va->va_start, va->va_end, KASAN_VMALLOC_PAGE_RANGE); + + if (need_resched() || (++batch_count >= KASAN_RELEASE_BATCH_SIZE)) { + cond_resched(); + batch_count = 0; + } } kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH); @@ -4354,6 +4367,7 @@ need_realloc: return n; } +EXPORT_SYMBOL(vrealloc_node_align_noprof); #if defined(CONFIG_64BIT) && 
defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) diff --git a/mm/vmscan.c b/mm/vmscan.c index 614ccf39fe3f..3fc4a4461927 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -63,7 +63,6 @@ #include <asm/div64.h> #include <linux/swapops.h> -#include <linux/balloon_compaction.h> #include <linux/sched/sysctl.h> #include "internal.h" @@ -104,13 +103,13 @@ struct scan_control { unsigned int force_deactivate:1; unsigned int skipped_deactivate:1; - /* Writepage batching in laptop mode; RECLAIM_WRITE */ + /* zone_reclaim_mode, boost reclaim */ unsigned int may_writepage:1; - /* Can mapped folios be reclaimed? */ + /* zone_reclaim_mode */ unsigned int may_unmap:1; - /* Can folios be swapped as part of reclaim? */ + /* zone_reclaim_mode, boost reclaim, cgroup restrictions */ unsigned int may_swap:1; /* Not allow cache_trim_mode to be turned on as part of reclaim? */ @@ -507,7 +506,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. */ - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) + if (kswapd_test_hopeless(pgdat)) return true; /* @@ -758,10 +757,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __swap_cache_del_folio(ci, folio, swap, shadow); memcg1_swapout(folio, swap); + __swap_cache_del_folio(ci, folio, swap, shadow); swap_cluster_unlock_irq(ci); - put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); @@ -1063,7 +1061,7 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) /* * We can "enter_fs" for swap-cache with only __GFP_IO * providing this isn't SWP_FS_OPS. - * ->flags can be updated non-atomicially (scan_swap_map_slots), + * ->flags can be updated non-atomically (scan_swap_map_slots), * but that will never affect SWP_FS_OPS, so the data_race * is safe. 
*/ @@ -1276,58 +1274,58 @@ retry: * Try to allocate it some swap space here. * Lazyfree folio could be freed directly */ - if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { - if (!folio_test_swapcache(folio)) { - if (!(sc->gfp_mask & __GFP_IO)) - goto keep_locked; - if (folio_maybe_dma_pinned(folio)) - goto keep_locked; - if (folio_test_large(folio)) { - /* cannot split folio, skip it */ - if (folio_expected_ref_count(folio) != - folio_ref_count(folio) - 1) - goto activate_locked; - /* - * Split partially mapped folios right away. - * We can free the unmapped pages without IO. - */ - if (data_race(!list_empty(&folio->_deferred_list) && - folio_test_partially_mapped(folio)) && - split_folio_to_list(folio, folio_list)) - goto activate_locked; - } - if (folio_alloc_swap(folio)) { - int __maybe_unused order = folio_order(folio); - - if (!folio_test_large(folio)) - goto activate_locked_split; - /* Fallback to swap normal pages */ - if (split_folio_to_list(folio, folio_list)) - goto activate_locked; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (nr_pages >= HPAGE_PMD_NR) { - count_memcg_folio_events(folio, - THP_SWPOUT_FALLBACK, 1); - count_vm_event(THP_SWPOUT_FALLBACK); - } -#endif - count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); - if (folio_alloc_swap(folio)) - goto activate_locked_split; - } + if (folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio)) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + if (folio_maybe_dma_pinned(folio)) + goto keep_locked; + if (folio_test_large(folio)) { + /* cannot split folio, skip it */ + if (folio_expected_ref_count(folio) != + folio_ref_count(folio) - 1) + goto activate_locked; /* - * Normally the folio will be dirtied in unmap because its - * pte should be dirty. A special case is MADV_FREE page. The - * page's pte could have dirty bit cleared but the folio's - * SwapBacked flag is still set because clearing the dirty bit - * and SwapBacked flag has no lock protected. 
For such folio, - * unmap will not set dirty bit for it, so folio reclaim will - * not write the folio out. This can cause data corruption when - * the folio is swapped in later. Always setting the dirty flag - * for the folio solves the problem. + * Split partially mapped folios right away. + * We can free the unmapped pages without IO. */ - folio_mark_dirty(folio); + if (data_race(!list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio)) && + split_folio_to_list(folio, folio_list)) + goto activate_locked; } + if (folio_alloc_swap(folio)) { + int __maybe_unused order = folio_order(folio); + + if (!folio_test_large(folio)) + goto activate_locked_split; + /* Fallback to swap normal pages */ + if (split_folio_to_list(folio, folio_list)) + goto activate_locked; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (nr_pages >= HPAGE_PMD_NR) { + count_memcg_folio_events(folio, + THP_SWPOUT_FALLBACK, 1); + count_vm_event(THP_SWPOUT_FALLBACK); + } +#endif + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); + if (folio_alloc_swap(folio)) + goto activate_locked_split; + } + /* + * Normally the folio will be dirtied in unmap because + * its pte should be dirty. A special case is MADV_FREE + * page. The page's pte could have dirty bit cleared but + * the folio's SwapBacked flag is still set because + * clearing the dirty bit and SwapBacked flag has no + * lock protected. For such folio, unmap will not set + * dirty bit for it, so folio reclaim will not write the + * folio out. This can cause data corruption when the + * folio is swapped in later. Always setting the dirty + * flag for the folio solves the problem. 
+ */ + folio_mark_dirty(folio); } /* @@ -2451,9 +2449,9 @@ static inline void calculate_pressure_balance(struct scan_control *sc, static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, struct scan_control *sc, unsigned long scan) { - unsigned long min, low; + unsigned long min, low, usage; - mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low); + mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low, &usage); if (min || low) { /* @@ -2485,7 +2483,6 @@ static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, * again by how much of the total memory used is under * hard protection. */ - unsigned long cgroup_size = mem_cgroup_size(memcg); unsigned long protection; /* memory.low scaling, make sure we retry before OOM */ @@ -2497,9 +2494,9 @@ static unsigned long apply_proportional_protection(struct mem_cgroup *memcg, } /* Avoid TOCTOU with earlier protection check */ - cgroup_size = max(cgroup_size, protection); + usage = max(usage, protection); - scan -= scan * protection / (cgroup_size + 1); + scan -= scan * protection / (usage + 1); /* * Minimally target SWAP_CLUSTER_MAX pages to keep @@ -3516,7 +3513,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, return false; } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; @@ -3557,7 +3554,7 @@ restart: if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); pte_unmap_unlock(pte, ptl); return suitable_to_scan(total, young); @@ -3598,7 +3595,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if (!spin_trylock(ptl)) goto done; - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); do { unsigned long pfn; @@ -3645,7 +3642,7 @@ next: walk_update_folio(walk, last, gen, dirty); - 
arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); spin_unlock(ptl); done: *first = -1; @@ -4244,7 +4241,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } } - arch_enter_lazy_mmu_mode(); + lazy_mmu_mode_enable(); pte -= (addr - start) / PAGE_SIZE; @@ -4278,7 +4275,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) walk_update_folio(walk, last, gen, dirty); - arch_leave_lazy_mmu_mode(); + lazy_mmu_mode_disable(); /* feedback from rmap walkers to page table walkers */ if (mm_state && suitable_to_scan(i, young)) @@ -5067,7 +5064,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * blk_finish_plug(&plug); done: if (sc->nr_reclaimed > reclaimed) - atomic_set(&pgdat->kswapd_failures, 0); + kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx); } /****************************************************************************** @@ -5417,7 +5414,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) if (memcg) cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); #endif - seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); + seq_printf(m, "memcg %llu %s\n", mem_cgroup_id(memcg), path); } seq_printf(m, " node %5d\n", nid); @@ -5502,7 +5499,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co return -EINTR; } -static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, +static int run_cmd(char cmd, u64 memcg_id, int nid, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long opt) { struct lruvec *lruvec; @@ -5513,14 +5510,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, return -EINVAL; if (!mem_cgroup_disabled()) { - rcu_read_lock(); - - memcg = mem_cgroup_from_id(memcg_id); - if (!mem_cgroup_tryget(memcg)) - memcg = NULL; - - rcu_read_unlock(); - + memcg = mem_cgroup_get_from_id(memcg_id); if (!memcg) return -EINVAL; } @@ -5592,7 +5582,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const 
char __user *src, int n; int end; char cmd, swap_string[5]; - unsigned int memcg_id; + u64 memcg_id; unsigned int nid; unsigned long seq; unsigned int swappiness; @@ -5602,7 +5592,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, if (!*cur) continue; - n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid, + n = sscanf(cur, "%c %llu %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid, &seq, &end, swap_string, &end, &opt, &end); if (n < 4 || cur[end]) { err = -EINVAL; @@ -6141,7 +6131,7 @@ again: * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) - atomic_set(&pgdat->kswapd_failures, 0); + kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx); else if (sc->cache_trim_mode) sc->cache_trim_mode_failed = 1; } @@ -6366,13 +6356,6 @@ retry: if (sc->compaction_ready) break; - - /* - * If we're getting trouble reclaiming, start doing - * writepage even in laptop mode. - */ - if (sc->priority < DEF_PRIORITY - 2) - sc->may_writepage = 1; } while (--sc->priority >= 0); last_pgdat = NULL; @@ -6453,7 +6436,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) int i; bool wmark_ok; - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) + if (kswapd_test_hopeless(pgdat)) return true; for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { @@ -6581,7 +6564,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .may_swap = 1, }; @@ -6625,7 +6608,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .target_mem_cgroup = memcg, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, @@ -6671,7 +6654,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .reclaim_idx = 
MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), @@ -6862,7 +6845,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, wake_up_all(&pgdat->pfmemalloc_wait); /* Hopeless node, leave it to direct reclaim */ - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) + if (kswapd_test_hopeless(pgdat)) return true; if (pgdat_balanced(pgdat, order, highest_zoneidx)) { @@ -7052,7 +7035,7 @@ restart: * from reclaim context. If no pages are reclaimed, the * reclaim will be aborted. */ - sc.may_writepage = !laptop_mode && !nr_boost_reclaim; + sc.may_writepage = !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim; /* @@ -7062,13 +7045,6 @@ restart: */ kswapd_age_node(pgdat, &sc); - /* - * If we're getting trouble reclaiming, start doing writepage - * even in laptop mode. - */ - if (sc.priority < DEF_PRIORITY - 2) - sc.may_writepage = 1; - /* Call soft limit reclaim before calling shrink_node. */ sc.nr_scanned = 0; nr_soft_scanned = 0; @@ -7134,8 +7110,11 @@ restart: * watermark_high at this point. We need to avoid increasing the * failure count to prevent the kswapd thread from stopping. 
*/ - if (!sc.nr_reclaimed && !boosted) - atomic_inc(&pgdat->kswapd_failures); + if (!sc.nr_reclaimed && !boosted) { + int fail_cnt = atomic_inc_return(&pgdat->kswapd_failures); + /* kswapd context, low overhead to trace every failure */ + trace_mm_vmscan_kswapd_reclaim_fail(pgdat->node_id, fail_cnt); + } out: clear_reclaim_active(pgdat, highest_zoneidx); @@ -7394,7 +7373,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; /* Hopeless node, leave it to direct reclaim if possible */ - if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES || + if (kswapd_test_hopeless(pgdat) || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* @@ -7414,6 +7393,32 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, wake_up_interruptible(&pgdat->kswapd_wait); } +void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason) +{ + /* Only trace actual resets, not redundant zero-to-zero */ + if (atomic_xchg(&pgdat->kswapd_failures, 0)) + trace_mm_vmscan_kswapd_clear_hopeless(pgdat->node_id, reason); +} + +/* + * Reset kswapd_failures only when the node is balanced. Without this + * check, successful direct reclaim (e.g., from cgroup memory.high + * throttling) can keep resetting kswapd_failures even when the node + * cannot be balanced, causing kswapd to run endlessly. + */ +void kswapd_try_clear_hopeless(struct pglist_data *pgdat, + unsigned int order, int highest_zoneidx) +{ + if (pgdat_balanced(pgdat, order, highest_zoneidx)) + kswapd_clear_hopeless(pgdat, current_is_kswapd() ? 
+ KSWAPD_CLEAR_HOPELESS_KSWAPD : KSWAPD_CLEAR_HOPELESS_DIRECT); +} + +bool kswapd_test_hopeless(pg_data_t *pgdat) +{ + return atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES; +} + #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of @@ -7465,8 +7470,8 @@ void __meminit kswapd_run(int nid) pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ - pr_err("Failed to start kswapd on node %d,ret=%ld\n", - nid, PTR_ERR(pgdat->kswapd)); + pr_err("Failed to start kswapd on node %d, ret=%pe\n", + nid, pgdat->kswapd); BUG_ON(system_state < SYSTEM_RUNNING); pgdat->kswapd = NULL; } else { @@ -7800,7 +7805,7 @@ int user_proactive_reclaim(char *buf, .reclaim_idx = gfp_zone(gfp_mask), .proactive_swappiness = swappiness == -1 ? NULL : &swappiness, .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, + .may_writepage = 1, .nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX), .may_unmap = 1, .may_swap = 1, diff --git a/mm/vmstat.c b/mm/vmstat.c index d6e814c82952..86b14b0f77b5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -672,11 +672,6 @@ void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, } EXPORT_SYMBOL(mod_node_page_state); -void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) -{ - mod_node_state(pgdat, item, 1, 1); -} - void inc_node_page_state(struct page *page, enum node_stat_item item) { mod_node_state(page_pgdat(page), item, 1, 1); @@ -725,16 +720,6 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) } EXPORT_SYMBOL(dec_zone_page_state); -void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) -{ - unsigned long flags; - - local_irq_save(flags); - __inc_node_state(pgdat, item); - local_irq_restore(flags); -} -EXPORT_SYMBOL(inc_node_state); - void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, long delta) { @@ 
-1434,13 +1419,13 @@ const char * const vmstat_text[] = { [I(THP_SWPOUT)] = "thp_swpout", [I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback", #endif -#ifdef CONFIG_MEMORY_BALLOON +#ifdef CONFIG_BALLOON [I(BALLOON_INFLATE)] = "balloon_inflate", [I(BALLOON_DEFLATE)] = "balloon_deflate", -#ifdef CONFIG_BALLOON_COMPACTION +#ifdef CONFIG_BALLOON_MIGRATION [I(BALLOON_MIGRATE)] = "balloon_migrate", -#endif -#endif /* CONFIG_MEMORY_BALLOON */ +#endif /* CONFIG_BALLOON_MIGRATION */ +#endif /* CONFIG_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH [I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush", [I(NR_TLB_REMOTE_FLUSH_RECEIVED)] = "nr_tlb_remote_flush_received", @@ -1626,7 +1611,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, } } -/* Print out the free pages at each order for each migatetype */ +/* Print out the free pages at each order for each migratetype */ static void pagetypeinfo_showfree(struct seq_file *m, void *arg) { int order; @@ -1855,7 +1840,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n start_pfn: %lu" "\n reserved_highatomic: %lu" "\n free_highatomic: %lu", - atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES, + kswapd_test_hopeless(pgdat), zone->zone_start_pfn, zone->nr_reserved_highatomic, zone->nr_free_highatomic); @@ -2281,7 +2266,8 @@ void __init init_mm_internals(void) { int ret __maybe_unused; - mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0); + mm_percpu_wq = alloc_workqueue("mm_percpu_wq", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); #ifdef CONFIG_SMP ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", diff --git a/mm/workingset.c b/mm/workingset.c index e9f05634747a..13422d304715 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -254,7 +254,7 @@ static void *lru_gen_eviction(struct folio *folio) hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); - return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); + return 
pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset); } /* @@ -271,7 +271,7 @@ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset); - memcg = mem_cgroup_from_id(memcg_id); + memcg = mem_cgroup_from_private_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); @@ -395,7 +395,7 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ - memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); eviction >>= bucket_order; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); @@ -456,7 +456,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, * would be better if the root_mem_cgroup existed in all * configurations instead. */ - eviction_memcg = mem_cgroup_from_id(memcgid); + eviction_memcg = mem_cgroup_from_private_id(memcgid); if (!mem_cgroup_tryget(eviction_memcg)) eviction_memcg = NULL; rcu_read_unlock(); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5bf832f9c05c..d5d1c27b3852 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -30,6 +30,7 @@ #include <linux/highmem.h> #include <linux/string.h> #include <linux/slab.h> +#include <linux/scatterlist.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/shrinker.h> @@ -105,7 +106,7 @@ /* * On systems with 4K page size, this gives 255 size classes! 
There is a - * trader-off here: + * trade-off here: * - Large number of size classes is potentially wasteful as free page are * spread across these classes * - Small number of size classes causes large internal fragmentation @@ -192,12 +193,13 @@ struct link_free { }; }; +static struct kmem_cache *handle_cachep; +static struct kmem_cache *zspage_cachep; + struct zs_pool { const char *name; struct size_class *size_class[ZS_SIZE_CLASSES]; - struct kmem_cache *handle_cachep; - struct kmem_cache *zspage_cachep; atomic_long_t pages_allocated; @@ -370,60 +372,28 @@ static void init_deferred_free(struct zs_pool *pool) {} static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} #endif -static int create_cache(struct zs_pool *pool) +static unsigned long cache_alloc_handle(gfp_t gfp) { - char *name; - - name = kasprintf(GFP_KERNEL, "zs_handle-%s", pool->name); - if (!name) - return -ENOMEM; - pool->handle_cachep = kmem_cache_create(name, ZS_HANDLE_SIZE, - 0, 0, NULL); - kfree(name); - if (!pool->handle_cachep) - return -EINVAL; - - name = kasprintf(GFP_KERNEL, "zspage-%s", pool->name); - if (!name) - return -ENOMEM; - pool->zspage_cachep = kmem_cache_create(name, sizeof(struct zspage), - 0, 0, NULL); - kfree(name); - if (!pool->zspage_cachep) { - kmem_cache_destroy(pool->handle_cachep); - pool->handle_cachep = NULL; - return -EINVAL; - } - - return 0; -} + gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE); -static void destroy_cache(struct zs_pool *pool) -{ - kmem_cache_destroy(pool->handle_cachep); - kmem_cache_destroy(pool->zspage_cachep); + return (unsigned long)kmem_cache_alloc(handle_cachep, gfp); } -static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) +static void cache_free_handle(unsigned long handle) { - return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); + kmem_cache_free(handle_cachep, (void *)handle); } -static void cache_free_handle(struct zs_pool *pool, unsigned long handle) 
+static struct zspage *cache_alloc_zspage(gfp_t gfp) { - kmem_cache_free(pool->handle_cachep, (void *)handle); -} + gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE); -static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) -{ - return kmem_cache_zalloc(pool->zspage_cachep, - flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); + return kmem_cache_zalloc(zspage_cachep, gfp); } -static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) +static void cache_free_zspage(struct zspage *zspage) { - kmem_cache_free(pool->zspage_cachep, zspage); + kmem_cache_free(zspage_cachep, zspage); } /* class->lock(which owns the handle) synchronizes races */ @@ -852,7 +822,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, zpdesc = next; } while (zpdesc != NULL); - cache_free_zspage(pool, zspage); + cache_free_zspage(zspage); class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage); atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); @@ -965,7 +935,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, { int i; struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE]; - struct zspage *zspage = cache_alloc_zspage(pool, gfp); + struct zspage *zspage = cache_alloc_zspage(gfp); if (!zspage) return NULL; @@ -987,7 +957,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, zpdesc_dec_zone_page_state(zpdescs[i]); free_zpdesc(zpdescs[i]); } - cache_free_zspage(pool, zspage); + cache_free_zspage(zspage); return NULL; } __zpdesc_set_zsmalloc(zpdesc); @@ -1065,7 +1035,7 @@ unsigned long zs_get_total_pages(struct zs_pool *pool) EXPORT_SYMBOL_GPL(zs_get_total_pages); void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, - void *local_copy) + size_t mem_len, void *local_copy) { struct zspage *zspage; struct zpdesc *zpdesc; @@ -1087,7 +1057,10 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); - if (off + 
class->size <= PAGE_SIZE) { + if (!ZsHugePage(zspage)) + off += ZS_HANDLE_SIZE; + + if (off + mem_len <= PAGE_SIZE) { /* this object is contained entirely within a page */ addr = kmap_local_zpdesc(zpdesc); addr += off; @@ -1096,7 +1069,7 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, /* this object spans two pages */ sizes[0] = PAGE_SIZE - off; - sizes[1] = class->size - sizes[0]; + sizes[1] = mem_len - sizes[0]; addr = local_copy; memcpy_from_page(addr, zpdesc_page(zpdesc), @@ -1107,15 +1080,12 @@ void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, 0, sizes[1]); } - if (!ZsHugePage(zspage)) - addr += ZS_HANDLE_SIZE; - return addr; } EXPORT_SYMBOL_GPL(zs_obj_read_begin); void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, - void *handle_mem) + size_t mem_len, void *handle_mem) { struct zspage *zspage; struct zpdesc *zpdesc; @@ -1129,9 +1099,10 @@ void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); - if (off + class->size <= PAGE_SIZE) { - if (!ZsHugePage(zspage)) - off += ZS_HANDLE_SIZE; + if (!ZsHugePage(zspage)) + off += ZS_HANDLE_SIZE; + + if (off + mem_len <= PAGE_SIZE) { handle_mem -= off; kunmap_local(handle_mem); } @@ -1140,6 +1111,68 @@ void zs_obj_read_end(struct zs_pool *pool, unsigned long handle, } EXPORT_SYMBOL_GPL(zs_obj_read_end); +void zs_obj_read_sg_begin(struct zs_pool *pool, unsigned long handle, + struct scatterlist *sg, size_t mem_len) +{ + struct zspage *zspage; + struct zpdesc *zpdesc; + unsigned long obj, off; + unsigned int obj_idx; + struct size_class *class; + + /* Guarantee we can get zspage from handle safely */ + read_lock(&pool->lock); + obj = handle_to_obj(handle); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); + + /* Make sure migration doesn't move any pages in this zspage */ + zspage_read_lock(zspage); + read_unlock(&pool->lock); + + class = zspage_class(pool, 
zspage); + off = offset_in_page(class->size * obj_idx); + + if (!ZsHugePage(zspage)) + off += ZS_HANDLE_SIZE; + + if (off + mem_len <= PAGE_SIZE) { + /* this object is contained entirely within a page */ + sg_init_table(sg, 1); + sg_set_page(sg, zpdesc_page(zpdesc), mem_len, off); + } else { + size_t sizes[2]; + + /* this object spans two pages */ + sizes[0] = PAGE_SIZE - off; + sizes[1] = mem_len - sizes[0]; + + sg_init_table(sg, 2); + sg_set_page(sg, zpdesc_page(zpdesc), sizes[0], off); + + zpdesc = get_next_zpdesc(zpdesc); + sg = sg_next(sg); + + sg_set_page(sg, zpdesc_page(zpdesc), sizes[1], 0); + } +} +EXPORT_SYMBOL_GPL(zs_obj_read_sg_begin); + +void zs_obj_read_sg_end(struct zs_pool *pool, unsigned long handle) +{ + struct zspage *zspage; + struct zpdesc *zpdesc; + unsigned long obj; + unsigned int obj_idx; + + obj = handle_to_obj(handle); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); + + zspage_read_unlock(zspage); +} +EXPORT_SYMBOL_GPL(zs_obj_read_sg_end); + void zs_obj_write(struct zs_pool *pool, unsigned long handle, void *handle_mem, size_t mem_len) { @@ -1275,7 +1308,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp, if (unlikely(size > ZS_MAX_ALLOC_SIZE)) return (unsigned long)ERR_PTR(-ENOSPC); - handle = cache_alloc_handle(pool, gfp); + handle = cache_alloc_handle(gfp); if (!handle) return (unsigned long)ERR_PTR(-ENOMEM); @@ -1299,7 +1332,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp, zspage = alloc_zspage(pool, class, gfp, nid); if (!zspage) { - cache_free_handle(pool, handle); + cache_free_handle(handle); return (unsigned long)ERR_PTR(-ENOMEM); } @@ -1379,7 +1412,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle) free_zspage(pool, class, zspage); spin_unlock(&class->lock); - cache_free_handle(pool, handle); + cache_free_handle(handle); } EXPORT_SYMBOL_GPL(zs_free); @@ -2041,9 +2074,6 @@ struct zs_pool *zs_create_pool(const char *name) if (!pool->name) goto 
err; - if (create_cache(pool)) - goto err; - /* * Iterate reversely, because, size of size_class that we want to use * for merging should be larger or equal to current size. @@ -2165,20 +2195,47 @@ void zs_destroy_pool(struct zs_pool *pool) kfree(class); } - destroy_cache(pool); kfree(pool->name); kfree(pool); } EXPORT_SYMBOL_GPL(zs_destroy_pool); +static void zs_destroy_caches(void) +{ + kmem_cache_destroy(handle_cachep); + handle_cachep = NULL; + kmem_cache_destroy(zspage_cachep); + zspage_cachep = NULL; +} + +static int __init zs_init_caches(void) +{ + handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, + 0, 0, NULL); + zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), + 0, 0, NULL); + + if (!handle_cachep || !zspage_cachep) { + zs_destroy_caches(); + return -ENOMEM; + } + return 0; +} + static int __init zs_init(void) { - int rc __maybe_unused; + int rc; + + rc = zs_init_caches(); + if (rc) + return rc; #ifdef CONFIG_COMPACTION rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc); - if (rc) + if (rc) { + zs_destroy_caches(); return rc; + } #endif zs_stat_init(); return 0; @@ -2190,6 +2247,7 @@ static void __exit zs_exit(void) set_movable_ops(NULL, PGTY_zsmalloc); #endif zs_stat_exit(); + zs_destroy_caches(); } module_init(zs_init); diff --git a/mm/zswap.c b/mm/zswap.c index ac9b7a60736b..af3f0fbb0558 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -26,6 +26,7 @@ #include <linux/mempolicy.h> #include <linux/mempool.h> #include <crypto/acompress.h> +#include <crypto/scatterwalk.h> #include <linux/zswap.h> #include <linux/mm_types.h> #include <linux/page-flags.h> @@ -141,7 +142,6 @@ struct crypto_acomp_ctx { struct crypto_wait wait; u8 *buffer; struct mutex mutex; - bool is_sleepable; }; /* @@ -749,8 +749,8 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); if (IS_ERR(acomp)) { - pr_err("could not alloc crypto acomp %s : %ld\n", - 
pool->tfm_name, PTR_ERR(acomp)); + pr_err("could not alloc crypto acomp %s : %pe\n", + pool->tfm_name, acomp); ret = PTR_ERR(acomp); goto fail; } @@ -781,7 +781,6 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp_ctx->buffer = buffer; acomp_ctx->acomp = acomp; - acomp_ctx->is_sleepable = acomp_is_async(acomp); acomp_ctx->req = req; mutex_unlock(&acomp_ctx->mutex); return 0; @@ -933,52 +932,47 @@ unlock: static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) { struct zswap_pool *pool = entry->pool; - struct scatterlist input, output; + struct scatterlist input[2]; /* zsmalloc returns an SG list of 1-2 entries */ + struct scatterlist output; struct crypto_acomp_ctx *acomp_ctx; - int decomp_ret = 0, dlen = PAGE_SIZE; - u8 *src, *obj; + int ret = 0, dlen; acomp_ctx = acomp_ctx_get_cpu_lock(pool); - obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffer); + zs_obj_read_sg_begin(pool->zs_pool, entry->handle, input, entry->length); /* zswap entries of length PAGE_SIZE are not compressed. */ if (entry->length == PAGE_SIZE) { - memcpy_to_folio(folio, 0, obj, entry->length); - goto read_done; - } - - /* - * zs_obj_read_begin() might return a kmap address of highmem when - * acomp_ctx->buffer is not used. However, sg_init_one() does not - * handle highmem addresses, so copy the object to acomp_ctx->buffer. 
- */ - if (virt_addr_valid(obj)) { - src = obj; + void *dst; + + WARN_ON_ONCE(input->length != PAGE_SIZE); + /* Unmap and flush afterwards, as memcpy_to_folio() did. */ + dst = kmap_local_folio(folio, 0); + memcpy_from_sglist(dst, input, 0, PAGE_SIZE); + kunmap_local(dst); + flush_dcache_folio(folio); + dlen = PAGE_SIZE; } else { - WARN_ON_ONCE(obj == acomp_ctx->buffer); - memcpy(acomp_ctx->buffer, obj, entry->length); - src = acomp_ctx->buffer; + sg_init_table(&output, 1); + sg_set_folio(&output, folio, PAGE_SIZE, 0); + acomp_request_set_params(acomp_ctx->req, input, &output, + entry->length, PAGE_SIZE); + ret = crypto_acomp_decompress(acomp_ctx->req); + ret = crypto_wait_req(ret, &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; } - sg_init_one(&input, src, entry->length); - sg_init_table(&output, 1); - sg_set_folio(&output, folio, PAGE_SIZE, 0); - acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); - decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - -read_done: - zs_obj_read_end(pool->zs_pool, entry->handle, obj); + zs_obj_read_sg_end(pool->zs_pool, entry->handle); acomp_ctx_put_unlock(acomp_ctx); - if (!decomp_ret && dlen == PAGE_SIZE) + if (!ret && dlen == PAGE_SIZE) return true; zswap_decompress_fail++; pr_alert_ratelimited("Decompression error from zswap (%d:%lu %s %u->%d)\n", swp_type(entry->swpentry), swp_offset(entry->swpentry), - entry->pool->tfm_name, entry->length, dlen); + entry->pool->tfm_name, + entry->length, dlen); return false; } @@ -1014,8 +1008,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry, return -EEXIST; mpol = get_task_policy(current); - folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &folio_was_allocated); put_swap_device(si); if (!folio) return -ENOMEM; |
