Diffstat (limited to 'mm')
101 files changed, 6211 insertions, 5276 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 84000b016808..0b7f4bb5cb80 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -242,6 +242,10 @@ menu "Slab allocator options" config SLUB def_bool y +config KVFREE_RCU_BATCHED + def_bool y + depends on !SLUB_TINY && !TINY_RCU + config SLUB_TINY bool "Configure for minimal memory footprint" depends on EXPERT @@ -550,20 +554,63 @@ menuconfig MEMORY_HOTPLUG if MEMORY_HOTPLUG -config MEMORY_HOTPLUG_DEFAULT_ONLINE - bool "Online the newly added memory blocks by default" - depends on MEMORY_HOTPLUG +choice + prompt "Memory Hotplug Default Online Type" + default MHP_DEFAULT_ONLINE_TYPE_OFFLINE help + Default memory type for hotplugged memory. + This option sets the default policy setting for memory hotplug onlining policy (/sys/devices/system/memory/auto_online_blocks) which determines what happens to newly added memory regions. Policy setting can always be changed at runtime. + + The default is 'offline'. + + Select offline to defer onlining to drivers and user policy. + Select auto to let the kernel choose what zones to utilize. + Select online_kernel to generally allow kernel usage of this memory. + Select online_movable to generally disallow kernel usage of this memory. + + Example kernel usage would be page structs and page tables. + See Documentation/admin-guide/mm/memory-hotplug.rst for more information. - Say Y here if you want all hot-plugged memory blocks to appear in - 'online' state by default. - Say N here if you want the default policy to keep all hot-plugged - memory blocks in 'offline' state. +config MHP_DEFAULT_ONLINE_TYPE_OFFLINE + bool "offline" + help + Hotplugged memory will not be onlined by default. + Choose this for systems with drivers and user policy that + handle onlining of hotplug memory policy. + +config MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO + bool "auto" + help + Select this if you want the kernel to automatically online + hotplugged memory into the zone it thinks is reasonable. + This memory may be utilized for kernel data. + +config MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL + bool "kernel" + help + Select this if you want the kernel to automatically online + hotplugged memory into a zone capable of being used for kernel + data. This typically means ZONE_NORMAL. + +config MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE + bool "movable" + help + Select this if you want the kernel to automatically online + hotplug memory into ZONE_MOVABLE. This memory will generally + not be utilized for kernel data. + + This should only be used when the admin knows sufficient + ZONE_NORMAL memory is available to describe hotplug memory, + otherwise hotplug memory may fail to online. For example, + sufficient kernel-capable memory (ZONE_NORMAL) must be + available to allocate page structs to describe ZONE_MOVABLE. + +endchoice config MEMORY_HOTREMOVE bool "Allow for memory hot remove" @@ -1301,6 +1348,21 @@ config ARCH_HAS_USER_SHADOW_STACK The architecture has hardware support for userspace shadow call stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss). +config ARCH_SUPPORTS_PT_RECLAIM + def_bool n + +config PT_RECLAIM + bool "reclaim empty user page table pages" + default y + depends on ARCH_SUPPORTS_PT_RECLAIM && MMU && SMP + select MMU_GATHER_RCU_TABLE_FREE + help + Try to reclaim empty user page table pages in paths other than munmap + and exit_mmap path. + + Note: now only empty user PTE page table pages will be reclaimed. 
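/*
 * Illustrative sketch, not part of the patch: the memory hotplug help text
 * earlier in this hunk notes that the onlining policy can be changed at
 * runtime through /sys/devices/system/memory/auto_online_blocks. A minimal
 * userspace helper, assuming the conventional value strings ("offline",
 * "online", "online_kernel", "online_movable"):
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int set_auto_online_policy(const char *policy)
{
	int fd = open("/sys/devices/system/memory/auto_online_blocks", O_WRONLY);
	ssize_t written;

	if (fd < 0)
		return -1;
	written = write(fd, policy, strlen(policy));
	close(fd);
	return written < 0 ? -1 : 0;
}
/* e.g. set_auto_online_policy("online_movable"); */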
+ + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index dba52bb0da8a..850386a67b3e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -146,3 +146,4 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o +obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o @@ -36,7 +36,7 @@ struct cma { }; extern struct cma cma_areas[MAX_CMA_AREAS]; -extern unsigned cma_area_count; +extern unsigned int cma_area_count; static inline unsigned long cma_bitmap_maxno(struct cma *cma) { diff --git a/mm/compaction.c b/mm/compaction.c index a2b16b08cbbf..a3203d97123e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -83,6 +83,7 @@ static inline bool is_via_compact_memory(int order) { return false; } static struct page *mark_allocated_noprof(struct page *page, unsigned int order, gfp_t gfp_flags) { post_alloc_hook(page, order, __GFP_MOVABLE); + set_page_refcounted(page); return page; } #define mark_allocated(...) alloc_hooks(mark_allocated_noprof(__VA_ARGS__)) @@ -630,7 +631,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (PageCompound(page)) { const unsigned int order = compound_order(page); - if (blockpfn + (1UL << order) <= end_pfn) { + if ((order <= MAX_PAGE_ORDER) && + (blockpfn + (1UL << order) <= end_pfn)) { blockpfn += (1UL << order) - 1; page += (1UL << order) - 1; nr_scanned += (1UL << order) - 1; @@ -1868,6 +1870,7 @@ again: dst = (struct folio *)freepage; post_alloc_hook(&dst->page, order, __GFP_MOVABLE); + set_page_refcounted(&dst->page); if (order) prep_compound_page(&dst->page, order); cc->nr_freepages -= 1 << order; @@ -2488,7 +2491,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, */ static enum compact_result compaction_suit_allocation_order(struct zone *zone, unsigned int order, - int highest_zoneidx, unsigned int alloc_flags) + int highest_zoneidx, unsigned int alloc_flags, + bool async) { unsigned long watermark; @@ -2497,6 +2501,23 @@ compaction_suit_allocation_order(struct zone *zone, unsigned int order, alloc_flags)) return COMPACT_SUCCESS; + /* + * For unmovable allocations (without ALLOC_CMA), check if there is enough + * free memory in the non-CMA pageblocks. Otherwise compaction could form + * the high-order page in CMA pageblocks, which would not help the + * allocation to succeed. However, limit the check to costly order async + * compaction (such as opportunistic THP attempts) because there is the + * possibility that compaction would migrate pages from non-CMA to CMA + * pageblock. 
+ */ + if (order > PAGE_ALLOC_COSTLY_ORDER && async && + !(alloc_flags & ALLOC_CMA)) { + watermark = low_wmark_pages(zone) + compact_gap(order); + if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx, + 0, zone_page_state(zone, NR_FREE_PAGES))) + return COMPACT_SKIPPED; + } + if (!compaction_suitable(zone, order, highest_zoneidx)) return COMPACT_SKIPPED; @@ -2532,7 +2553,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) if (!is_via_compact_memory(cc->order)) { ret = compaction_suit_allocation_order(cc->zone, cc->order, cc->highest_zoneidx, - cc->alloc_flags); + cc->alloc_flags, + cc->mode == MIGRATE_ASYNC); if (ret != COMPACT_CONTINUE) return ret; } @@ -3035,7 +3057,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) ret = compaction_suit_allocation_order(zone, pgdat->kcompactd_max_order, - highest_zoneidx, ALLOC_WMARK_MIN); + highest_zoneidx, ALLOC_WMARK_MIN, + false); if (ret == COMPACT_CONTINUE) return true; } @@ -3076,7 +3099,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) continue; ret = compaction_suit_allocation_order(zone, - cc.order, zoneid, ALLOC_WMARK_MIN); + cc.order, zoneid, ALLOC_WMARK_MIN, + false); if (ret != COMPACT_CONTINUE) continue; @@ -3154,15 +3178,10 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) static int kcompactd(void *p) { pg_data_t *pgdat = (pg_data_t *)p; - struct task_struct *tsk = current; long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC); long timeout = default_timeout; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(tsk, cpumask); - + current->flags |= PF_KCOMPACTD; set_freezable(); pgdat->kcompactd_max_order = 0; @@ -3219,6 +3238,8 @@ static int kcompactd(void *p) pgdat->proactive_compact_trigger = false; } + current->flags &= ~PF_KCOMPACTD; + return 0; } @@ -3233,10 +3254,12 @@ void __meminit kcompactd_run(int nid) if (pgdat->kcompactd) return; - pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + pgdat->kcompactd = kthread_create_on_node(kcompactd, pgdat, nid, "kcompactd%d", nid); if (IS_ERR(pgdat->kcompactd)) { pr_err("Failed to start kcompactd on node %d\n", nid); pgdat->kcompactd = NULL; + } else { + wake_up_process(pgdat->kcompactd); } } @@ -3254,30 +3277,6 @@ void __meminit kcompactd_stop(int nid) } } -/* - * It's optimal to keep kcompactd on the same CPUs as their memory, but - * not required for correctness. So if the last cpu in a node goes - * away, we get changed to run anywhere: as the first one comes back, - * restore their cpu bindings. 
- */ -static int kcompactd_cpu_online(unsigned int cpu) -{ - int nid; - - for_each_node_state(nid, N_MEMORY) { - pg_data_t *pgdat = NODE_DATA(nid); - const struct cpumask *mask; - - mask = cpumask_of_node(pgdat->node_id); - - if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) - /* One of our CPUs online: restore mask */ - if (pgdat->kcompactd) - set_cpus_allowed_ptr(pgdat->kcompactd, mask); - } - return 0; -} - static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3297,7 +3296,7 @@ static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table, return ret; } -static struct ctl_table vm_compaction[] = { +static const struct ctl_table vm_compaction[] = { { .procname = "compact_memory", .data = &sysctl_compact_memory, @@ -3337,15 +3336,6 @@ static struct ctl_table vm_compaction[] = { static int __init kcompactd_init(void) { int nid; - int ret; - - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "mm/compaction:online", - kcompactd_cpu_online, NULL); - if (ret < 0) { - pr_err("kcompactd: failed to register hotplug callbacks.\n"); - return ret; - } for_each_node_state(nid, N_MEMORY) kcompactd_run(nid); diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index d0357f3e9372..c213cf8b5638 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -71,36 +71,6 @@ config DAMON_SYSFS_KUNIT_TEST If unsure, say N. -config DAMON_DBGFS_DEPRECATED - bool "DAMON debugfs interface (DEPRECATED!)" - depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS - help - This builds the debugfs interface for DAMON. The user space admins - can use the interface for arbitrary data access monitoring. - - If unsure, say N. - - This is deprecated, so users should move to the sysfs interface - (DAMON_SYSFS). If you depend on this and cannot move, please report - your usecase to damon@lists.linux.dev and linux-mm@kvack.org. - -config DAMON_DBGFS - bool - default y - depends on DAMON_DBGFS_DEPRECATED - -config DAMON_DBGFS_KUNIT_TEST - bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS - depends on DAMON_DBGFS && KUNIT=y - default KUNIT_ALL_TESTS - help - This builds the DAMON debugfs interface Kunit test suite. - - For more information on KUnit and unit tests in general, please refer - to the KUnit documentation. - - If unsure, say N. 
- config DAMON_RECLAIM bool "Build DAMON-based reclaim (DAMON_RECLAIM)" depends on DAMON_PADDR diff --git a/mm/damon/Makefile b/mm/damon/Makefile index f7add3f4aa79..8b49012ba8c3 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -4,6 +4,5 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o -obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o diff --git a/mm/damon/core.c b/mm/damon/core.c index 0776452a1abb..384935ef4e65 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -14,6 +14,7 @@ #include <linux/psi.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/string_choices.h> #define CREATE_TRACE_POINTS #include <trace/events/damon.h> @@ -266,7 +267,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, } struct damos_filter *damos_new_filter(enum damos_filter_type type, - bool matching) + bool matching, bool allow) { struct damos_filter *filter; @@ -275,6 +276,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type, return NULL; filter->type = type; filter->matching = matching; + filter->allow = allow; INIT_LIST_HEAD(&filter->list); return filter; } @@ -371,6 +373,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, * or damon_attrs are updated. */ scheme->next_apply_sis = 0; + scheme->walk_completed = false; INIT_LIST_HEAD(&scheme->filters); scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -504,6 +507,8 @@ struct damon_ctx *damon_new_ctx(void) ctx->next_ops_update_sis = 0; mutex_init(&ctx->kdamond_lock); + mutex_init(&ctx->call_control_lock); + mutex_init(&ctx->walk_control_lock); ctx->attrs.min_nr_regions = 10; ctx->attrs.max_nr_regions = 1000; @@ -803,7 +808,8 @@ static int damos_commit_filters(struct damos *dst, struct damos *src) continue; new_filter = damos_new_filter( - src_filter->type, src_filter->matching); + src_filter->type, src_filter->matching, + src_filter->allow); if (!new_filter) return -ENOMEM; damos_commit_filter_arg(new_filter, src_filter); @@ -1162,6 +1168,94 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) return err; } +static bool damon_is_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + return running; +} + +/** + * damon_call() - Invoke a given function on DAMON worker thread (kdamond). + * @ctx: DAMON context to call the function for. + * @control: Control variable of the call request. + * + * Ask DAMON worker thread (kdamond) of @ctx to call a function with an + * argument data that respectively passed via &damon_call_control->fn and + * &damon_call_control->data of @control, and wait until the kdamond finishes + * handling of the request. + * + * The kdamond executes the function with the argument in the main loop, just + * after a sampling of the iteration is finished. The function can hence + * safely access the internal data of the &struct damon_ctx without additional + * synchronization. The return value of the function will be saved in + * &damon_call_control->return_code. + * + * Return: 0 on success, negative error code otherwise. 
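/*
 * Illustrative sketch, not part of the patch: how a caller might use
 * damon_call() as described in the kernel-doc above. The fn, data and
 * return_code field names come from that documentation; the exact layout of
 * struct damon_call_control and the int (*fn)(void *) callback signature are
 * assumptions inferred from kdamond_call() further below.
 */
static int hypothetical_fn(void *data)
{
	/* Runs on the kdamond thread, just after a sampling interval. */
	return 0;
}

static int hypothetical_caller(struct damon_ctx *ctx)
{
	struct damon_call_control control = {
		.fn = hypothetical_fn,
		.data = NULL,
	};
	int err;

	/* Blocks until the kdamond has run fn (or the call is canceled). */
	err = damon_call(ctx, &control);
	if (err)	/* -EBUSY, -EINVAL or -ECANCELED */
		return err;
	return control.return_code;
}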
+ */ +int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) +{ + init_completion(&control->completion); + control->canceled = false; + + mutex_lock(&ctx->call_control_lock); + if (ctx->call_control) { + mutex_unlock(&ctx->call_control_lock); + return -EBUSY; + } + ctx->call_control = control; + mutex_unlock(&ctx->call_control_lock); + if (!damon_is_running(ctx)) + return -EINVAL; + wait_for_completion(&control->completion); + if (control->canceled) + return -ECANCELED; + return 0; +} + +/** + * damos_walk() - Invoke a given functions while DAMOS walk regions. + * @ctx: DAMON context to call the functions for. + * @control: Control variable of the walk request. + * + * Ask DAMON worker thread (kdamond) of @ctx to call a function for each region + * that the kdamond will apply DAMOS action to, and wait until the kdamond + * finishes handling of the request. + * + * The kdamond executes the given function in the main loop, for each region + * just after it applied any DAMOS actions of @ctx to it. The invocation is + * made only within one &damos->apply_interval_us since damos_walk() + * invocation, for each scheme. The given callback function can hence safely + * access the internal data of &struct damon_ctx and &struct damon_region that + * each of the scheme will apply the action for next interval, without + * additional synchronizations against the kdamond. If every scheme of @ctx + * passed at least one &damos->apply_interval_us, kdamond marks the request as + * completed so that damos_walk() can wakeup and return. + * + * Return: 0 on success, negative error code otherwise. + */ +int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) +{ + init_completion(&control->completion); + control->canceled = false; + mutex_lock(&ctx->walk_control_lock); + if (ctx->walk_control) { + mutex_unlock(&ctx->walk_control_lock); + return -EBUSY; + } + ctx->walk_control = control; + mutex_unlock(&ctx->walk_control_lock); + if (!damon_is_running(ctx)) + return -EINVAL; + wait_for_completion(&control->completion); + if (control->canceled) + return -ECANCELED; + return 0; +} + /* * Reset the aggregated monitoring results ('nr_accesses' of each region). */ @@ -1272,16 +1366,18 @@ static bool damos_skip_charged_region(struct damon_target *t, } static void damos_update_stat(struct damos *s, - unsigned long sz_tried, unsigned long sz_applied) + unsigned long sz_tried, unsigned long sz_applied, + unsigned long sz_ops_filter_passed) { s->stat.nr_tried++; s->stat.sz_tried += sz_tried; if (sz_applied) s->stat.nr_applied++; s->stat.sz_applied += sz_applied; + s->stat.sz_ops_filter_passed += sz_ops_filter_passed; } -static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, +static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos_filter *filter) { bool matched = false; @@ -1334,13 +1430,103 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, { struct damos_filter *filter; + s->core_filters_allowed = false; damos_for_each_filter(filter, s) { - if (__damos_filter_out(ctx, t, r, filter)) - return true; + if (damos_filter_match(ctx, t, r, filter)) { + if (filter->allow) + s->core_filters_allowed = true; + return !filter->allow; + } } return false; } +/* + * damos_walk_call_walk() - Call &damos_walk_control->walk_fn. + * @ctx: The context of &damon_ctx->walk_control. + * @t: The monitoring target of @r that @s will be applied. + * @r: The region of @t that @s will be applied. 
+ * @s: The scheme of @ctx that will be applied to @r. + * + * This function is called from kdamond whenever it asked the operation set to + * apply a DAMOS scheme action to a region. If a DAMOS walk request is + * installed by damos_walk() and not yet uninstalled, invoke it. + */ +static void damos_walk_call_walk(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s, + unsigned long sz_filter_passed) +{ + struct damos_walk_control *control; + + mutex_lock(&ctx->walk_control_lock); + control = ctx->walk_control; + mutex_unlock(&ctx->walk_control_lock); + if (!control) + return; + control->walk_fn(control->data, ctx, t, r, s, sz_filter_passed); +} + +/* + * damos_walk_complete() - Complete DAMOS walk request if all walks are done. + * @ctx: The context of &damon_ctx->walk_control. + * @s: A scheme of @ctx that all walks are now done. + * + * This function is called when kdamond finished applying the action of a DAMOS + * scheme to all regions that eligible for the given &damos->apply_interval_us. + * If every scheme of @ctx including @s now finished walking for at least one + * &damos->apply_interval_us, this function makrs the handling of the given + * DAMOS walk request is done, so that damos_walk() can wake up and return. + */ +static void damos_walk_complete(struct damon_ctx *ctx, struct damos *s) +{ + struct damos *siter; + struct damos_walk_control *control; + + mutex_lock(&ctx->walk_control_lock); + control = ctx->walk_control; + mutex_unlock(&ctx->walk_control_lock); + if (!control) + return; + + s->walk_completed = true; + /* if all schemes completed, signal completion to walker */ + damon_for_each_scheme(siter, ctx) { + if (!siter->walk_completed) + return; + } + complete(&control->completion); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control = NULL; + mutex_unlock(&ctx->walk_control_lock); +} + +/* + * damos_walk_cancel() - Cancel the current DAMOS walk request. + * @ctx: The context of &damon_ctx->walk_control. + * + * This function is called when @ctx is deactivated by DAMOS watermarks, DAMOS + * walk is requested but there is no DAMOS scheme to walk for, or the kdamond + * is already out of the main loop and therefore gonna be terminated, and hence + * cannot continue the walks. This function therefore marks the walk request + * as canceled, so that damos_walk() can wake up and return. 
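/*
 * Illustrative sketch, not part of the patch: driving damos_walk() as
 * described in its kernel-doc above. The walk_fn and data field names and
 * the callback argument order are taken from damos_walk_call_walk(); the
 * callback's void return type and the exact layout of struct
 * damos_walk_control are assumptions.
 */
static void hypothetical_walk_fn(void *data, struct damon_ctx *ctx,
		struct damon_target *t, struct damon_region *r,
		struct damos *s, unsigned long sz_filter_passed)
{
	unsigned long *total = data;

	/* Called once per region a scheme action was just applied to. */
	*total += sz_filter_passed;
}

static int hypothetical_walker(struct damon_ctx *ctx, unsigned long *total)
{
	struct damos_walk_control control = {
		.walk_fn = hypothetical_walk_fn,
		.data = total,
	};

	*total = 0;
	/* Returns 0, or -EBUSY/-EINVAL/-ECANCELED like damon_call(). */
	return damos_walk(ctx, &control);
}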
+ */ +static void damos_walk_cancel(struct damon_ctx *ctx) +{ + struct damos_walk_control *control; + + mutex_lock(&ctx->walk_control_lock); + control = ctx->walk_control; + mutex_unlock(&ctx->walk_control_lock); + + if (!control) + return; + control->canceled = true; + complete(&control->completion); + mutex_lock(&ctx->walk_control_lock); + ctx->walk_control = NULL; + mutex_unlock(&ctx->walk_control_lock); +} + static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct damon_region *r, struct damos *s) { @@ -1348,6 +1534,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; + unsigned long sz_ops_filter_passed = 0; int err = 0; /* * We plan to support multiple context per kdamond, as DAMON sysfs @@ -1393,8 +1580,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, if (!err) { trace_damos_before_apply(cidx, sidx, tidx, r, damon_nr_regions(t), do_trace); - sz_applied = c->ops.apply_scheme(c, t, r, s); + sz_applied = c->ops.apply_scheme(c, t, r, s, + &sz_ops_filter_passed); } + damos_walk_call_walk(c, t, r, s, sz_ops_filter_passed); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); @@ -1408,7 +1597,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, r->age = 0; update_stat: - damos_update_stat(s, sz, sz_applied); + damos_update_stat(s, sz, sz_applied, sz_ops_filter_passed); } static void damon_do_apply_schemes(struct damon_ctx *c, @@ -1550,7 +1739,7 @@ static unsigned long damos_quota_score(struct damos_quota *quota) static void damos_set_effective_quota(struct damos_quota *quota) { unsigned long throughput; - unsigned long esz; + unsigned long esz = ULONG_MAX; if (!quota->ms && list_empty("a->goals)) { quota->esz = quota->sz; @@ -1572,10 +1761,7 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->total_charged_ns; else throughput = PAGE_SIZE * 1024; - if (!list_empty("a->goals)) - esz = min(throughput * quota->ms, esz); - else - esz = throughput * quota->ms; + esz = min(throughput * quota->ms, esz); } if (quota->sz && quota->sz < esz) @@ -1666,6 +1852,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) damon_for_each_scheme(s, c) { if (c->passed_sample_intervals < s->next_apply_sis) continue; + damos_walk_complete(c, s); s->next_apply_sis = c->passed_sample_intervals + (s->apply_interval_us ? s->apply_interval_us : c->attrs.aggr_interval) / sample_interval; @@ -1894,9 +2081,8 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme) if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) { if (scheme->wmarks.activated) pr_debug("deactivate a scheme (%d) for %s wmark\n", - scheme->action, - metric > scheme->wmarks.high ? - "high" : "low"); + scheme->action, + str_high_low(metric > scheme->wmarks.high)); scheme->wmarks.activated = false; return scheme->wmarks.interval; } @@ -1920,6 +2106,39 @@ static void kdamond_usleep(unsigned long usecs) usleep_range_idle(usecs, usecs + 1); } +/* + * kdamond_call() - handle damon_call_control. + * @ctx: The &struct damon_ctx of the kdamond. + * @cancel: Whether to cancel the invocation of the function. + * + * If there is a &struct damon_call_control request that registered via + * &damon_call() on @ctx, do or cancel the invocation of the function depending + * on @cancel. 
@cancel is set when the kdamond is deactivated by DAMOS + * watermarks, or the kdamond is already out of the main loop and therefore + * will be terminated. + */ +static void kdamond_call(struct damon_ctx *ctx, bool cancel) +{ + struct damon_call_control *control; + int ret = 0; + + mutex_lock(&ctx->call_control_lock); + control = ctx->call_control; + mutex_unlock(&ctx->call_control_lock); + if (!control) + return; + if (cancel) { + control->canceled = true; + } else { + ret = control->fn(control->data); + control->return_code = ret; + } + complete(&control->completion); + mutex_lock(&ctx->call_control_lock); + ctx->call_control = NULL; + mutex_unlock(&ctx->call_control_lock); +} + /* Returns negative error code if it's not activated but should return */ static int kdamond_wait_activation(struct damon_ctx *ctx) { @@ -1944,6 +2163,8 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) if (ctx->callback.after_wmarks_check && ctx->callback.after_wmarks_check(ctx)) break; + kdamond_call(ctx, true); + damos_walk_cancel(ctx); } return -EBUSY; } @@ -2014,6 +2235,7 @@ static int kdamond_fn(void *data) if (ctx->callback.after_sampling && ctx->callback.after_sampling(ctx)) break; + kdamond_call(ctx, false); kdamond_usleep(sample_interval); ctx->passed_sample_intervals++; @@ -2036,6 +2258,8 @@ static int kdamond_fn(void *data) */ if (!list_empty(&ctx->schemes)) kdamond_apply_schemes(ctx); + else + damos_walk_cancel(ctx); sample_interval = ctx->attrs.sample_interval ? ctx->attrs.sample_interval : 1; @@ -2075,6 +2299,9 @@ done: ctx->kdamond = NULL; mutex_unlock(&ctx->kdamond_lock); + kdamond_call(ctx, true); + damos_walk_cancel(ctx); + mutex_lock(&damon_lock); nr_running_ctxs--; if (!nr_running_ctxs && running_exclusive_ctxs) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c deleted file mode 100644 index b4213bc47e44..000000000000 --- a/mm/damon/dbgfs.c +++ /dev/null @@ -1,1148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DAMON Debugfs Interface - * - * Author: SeongJae Park <sj@kernel.org> - */ - -#define pr_fmt(fmt) "damon-dbgfs: " fmt - -#include <linux/damon.h> -#include <linux/debugfs.h> -#include <linux/file.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/page_idle.h> -#include <linux/slab.h> - -#define DAMON_DBGFS_DEPRECATION_NOTICE \ - "DAMON debugfs interface is deprecated, so users should move " \ - "to DAMON_SYSFS. If you cannot, please report your usecase to " \ - "damon@lists.linux.dev and linux-mm@kvack.org.\n" - -static struct damon_ctx **dbgfs_ctxs; -static int dbgfs_nr_ctxs; -static struct dentry **dbgfs_dirs; -static DEFINE_MUTEX(damon_dbgfs_lock); - -static void damon_dbgfs_warn_deprecation(void) -{ - pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE); -} - -/* - * Returns non-empty string on success, negative error code otherwise. 
- */ -static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) -{ - char *kbuf; - ssize_t ret; - - /* We do not accept continuous write */ - if (*ppos) - return ERR_PTR(-EINVAL); - - kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return ERR_PTR(-ENOMEM); - - ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count); - if (ret != count) { - kfree(kbuf); - return ERR_PTR(-EIO); - } - kbuf[ret] = '\0'; - - return kbuf; -} - -static ssize_t dbgfs_attrs_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char kbuf[128]; - int ret; - - mutex_lock(&ctx->kdamond_lock); - ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", - ctx->attrs.sample_interval, ctx->attrs.aggr_interval, - ctx->attrs.ops_update_interval, - ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions); - mutex_unlock(&ctx->kdamond_lock); - - return simple_read_from_buffer(buf, count, ppos, kbuf, ret); -} - -static ssize_t dbgfs_attrs_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - struct damon_attrs attrs; - char *kbuf; - ssize_t ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - if (sscanf(kbuf, "%lu %lu %lu %lu %lu", - &attrs.sample_interval, &attrs.aggr_interval, - &attrs.ops_update_interval, - &attrs.min_nr_regions, - &attrs.max_nr_regions) != 5) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ret = -EBUSY; - goto unlock_out; - } - - ret = damon_set_attrs(ctx, &attrs); - if (!ret) - ret = count; -unlock_out: - mutex_unlock(&ctx->kdamond_lock); -out: - kfree(kbuf); - return ret; -} - -/* - * Return corresponding dbgfs' scheme action value (int) for the given - * damos_action if the given damos_action value is valid and supported by - * dbgfs, negative error code otherwise. 
- */ -static int damos_action_to_dbgfs_scheme_action(enum damos_action action) -{ - switch (action) { - case DAMOS_WILLNEED: - return 0; - case DAMOS_COLD: - return 1; - case DAMOS_PAGEOUT: - return 2; - case DAMOS_HUGEPAGE: - return 3; - case DAMOS_NOHUGEPAGE: - return 4; - case DAMOS_STAT: - return 5; - default: - return -EINVAL; - } -} - -static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) -{ - struct damos *s; - int written = 0; - int rc; - - damon_for_each_scheme(s, c) { - rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - s->pattern.min_sz_region, - s->pattern.max_sz_region, - s->pattern.min_nr_accesses, - s->pattern.max_nr_accesses, - s->pattern.min_age_region, - s->pattern.max_age_region, - damos_action_to_dbgfs_scheme_action(s->action), - s->quota.ms, s->quota.sz, - s->quota.reset_interval, - s->quota.weight_sz, - s->quota.weight_nr_accesses, - s->quota.weight_age, - s->wmarks.metric, s->wmarks.interval, - s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat.nr_tried, s->stat.sz_tried, - s->stat.nr_applied, s->stat.sz_applied, - s->stat.qt_exceeds); - if (!rc) - return -ENOMEM; - - written += rc; - } - return written; -} - -static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t len; - - kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return -ENOMEM; - - mutex_lock(&ctx->kdamond_lock); - len = sprint_schemes(ctx, kbuf, count); - mutex_unlock(&ctx->kdamond_lock); - if (len < 0) - goto out; - len = simple_read_from_buffer(buf, count, ppos, kbuf, len); - -out: - kfree(kbuf); - return len; -} - -static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes) -{ - ssize_t i; - - for (i = 0; i < nr_schemes; i++) - kfree(schemes[i]); - kfree(schemes); -} - -/* - * Return corresponding damos_action for the given dbgfs input for a scheme - * action if the input is valid, negative error code otherwise. - */ -static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action) -{ - switch (dbgfs_action) { - case 0: - return DAMOS_WILLNEED; - case 1: - return DAMOS_COLD; - case 2: - return DAMOS_PAGEOUT; - case 3: - return DAMOS_HUGEPAGE; - case 4: - return DAMOS_NOHUGEPAGE; - case 5: - return DAMOS_STAT; - default: - return -EINVAL; - } -} - -/* - * Converts a string into an array of struct damos pointers - * - * Returns an array of struct damos pointers that converted if the conversion - * success, or NULL otherwise. 
- */ -static struct damos **str_to_schemes(const char *str, ssize_t len, - ssize_t *nr_schemes) -{ - struct damos *scheme, **schemes; - const int max_nr_schemes = 256; - int pos = 0, parsed, ret; - unsigned int action_input; - enum damos_action action; - - schemes = kmalloc_array(max_nr_schemes, sizeof(scheme), - GFP_KERNEL); - if (!schemes) - return NULL; - - *nr_schemes = 0; - while (pos < len && *nr_schemes < max_nr_schemes) { - struct damos_access_pattern pattern = {}; - struct damos_quota quota = {}; - struct damos_watermarks wmarks; - - ret = sscanf(&str[pos], - "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", - &pattern.min_sz_region, &pattern.max_sz_region, - &pattern.min_nr_accesses, - &pattern.max_nr_accesses, - &pattern.min_age_region, - &pattern.max_age_region, - &action_input, "a.ms, - "a.sz, "a.reset_interval, - "a.weight_sz, "a.weight_nr_accesses, - "a.weight_age, &wmarks.metric, - &wmarks.interval, &wmarks.high, &wmarks.mid, - &wmarks.low, &parsed); - if (ret != 18) - break; - action = dbgfs_scheme_action_to_damos_action(action_input); - if ((int)action < 0) - goto fail; - - if (pattern.min_sz_region > pattern.max_sz_region || - pattern.min_nr_accesses > pattern.max_nr_accesses || - pattern.min_age_region > pattern.max_age_region) - goto fail; - - if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || - wmarks.mid < wmarks.low) - goto fail; - - pos += parsed; - scheme = damon_new_scheme(&pattern, action, 0, "a, - &wmarks, NUMA_NO_NODE); - if (!scheme) - goto fail; - - schemes[*nr_schemes] = scheme; - *nr_schemes += 1; - } - return schemes; -fail: - free_schemes_arr(schemes, *nr_schemes); - return NULL; -} - -static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - struct damos **schemes; - ssize_t nr_schemes = 0, ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - schemes = str_to_schemes(kbuf, count, &nr_schemes); - if (!schemes) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ret = -EBUSY; - goto unlock_out; - } - - damon_set_schemes(ctx, schemes, nr_schemes); - ret = count; - nr_schemes = 0; - -unlock_out: - mutex_unlock(&ctx->kdamond_lock); - free_schemes_arr(schemes, nr_schemes); -out: - kfree(kbuf); - return ret; -} - -static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) -{ - struct damon_target *t; - int id; - int written = 0; - int rc; - - damon_for_each_target(t, ctx) { - if (damon_target_has_pid(ctx)) - /* Show pid numbers to debugfs users */ - id = pid_vnr(t->pid); - else - /* Show 42 for physical address space, just for fun */ - id = 42; - - rc = scnprintf(&buf[written], len - written, "%d ", id); - if (!rc) - return -ENOMEM; - written += rc; - } - if (written) - written -= 1; - written += scnprintf(&buf[written], len - written, "\n"); - return written; -} - -static ssize_t dbgfs_target_ids_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - ssize_t len; - char ids_buf[320]; - - mutex_lock(&ctx->kdamond_lock); - len = sprint_target_ids(ctx, ids_buf, 320); - mutex_unlock(&ctx->kdamond_lock); - if (len < 0) - return len; - - return simple_read_from_buffer(buf, count, ppos, ids_buf, len); -} - -/* - * Converts a string into an integers array - * - * Returns an array of integers array if the conversion success, or NULL - * otherwise. 
- */ -static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints) -{ - int *array; - const int max_nr_ints = 32; - int nr; - int pos = 0, parsed, ret; - - *nr_ints = 0; - array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL); - if (!array) - return NULL; - while (*nr_ints < max_nr_ints && pos < len) { - ret = sscanf(&str[pos], "%d%n", &nr, &parsed); - pos += parsed; - if (ret != 1) - break; - array[*nr_ints] = nr; - *nr_ints += 1; - } - - return array; -} - -static void dbgfs_put_pids(struct pid **pids, int nr_pids) -{ - int i; - - for (i = 0; i < nr_pids; i++) - put_pid(pids[i]); -} - -/* - * Converts a string into an struct pid pointers array - * - * Returns an array of struct pid pointers if the conversion success, or NULL - * otherwise. - */ -static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids) -{ - int *ints; - ssize_t nr_ints; - struct pid **pids; - - *nr_pids = 0; - - ints = str_to_ints(str, len, &nr_ints); - if (!ints) - return NULL; - - pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL); - if (!pids) - goto out; - - for (; *nr_pids < nr_ints; (*nr_pids)++) { - pids[*nr_pids] = find_get_pid(ints[*nr_pids]); - if (!pids[*nr_pids]) { - dbgfs_put_pids(pids, *nr_pids); - kfree(ints); - kfree(pids); - return NULL; - } - } - -out: - kfree(ints); - return pids; -} - -/* - * dbgfs_set_targets() - Set monitoring targets. - * @ctx: monitoring context - * @nr_targets: number of targets - * @pids: array of target pids (size is same to @nr_targets) - * - * This function should not be called while the kdamond is running. @pids is - * ignored if the context is not configured to have pid in each target. On - * failure, reference counts of all pids in @pids are decremented. - * - * Return: 0 on success, negative error code otherwise. 
- */ -static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, - struct pid **pids) -{ - ssize_t i; - struct damon_target *t, *next; - - damon_for_each_target_safe(t, next, ctx) { - if (damon_target_has_pid(ctx)) - put_pid(t->pid); - damon_destroy_target(t); - } - - for (i = 0; i < nr_targets; i++) { - t = damon_new_target(); - if (!t) { - damon_for_each_target_safe(t, next, ctx) - damon_destroy_target(t); - if (damon_target_has_pid(ctx)) - dbgfs_put_pids(pids, nr_targets); - return -ENOMEM; - } - if (damon_target_has_pid(ctx)) - t->pid = pids[i]; - damon_add_target(ctx, t); - } - - return 0; -} - -static ssize_t dbgfs_target_ids_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - bool id_is_pid = true; - char *kbuf; - struct pid **target_pids = NULL; - ssize_t nr_targets; - ssize_t ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - if (!strncmp(kbuf, "paddr\n", count)) { - id_is_pid = false; - nr_targets = 1; - } - - if (id_is_pid) { - target_pids = str_to_pids(kbuf, count, &nr_targets); - if (!target_pids) { - ret = -ENOMEM; - goto out; - } - } - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - if (id_is_pid) - dbgfs_put_pids(target_pids, nr_targets); - ret = -EBUSY; - goto unlock_out; - } - - /* remove previously set targets */ - dbgfs_set_targets(ctx, 0, NULL); - if (!nr_targets) { - ret = count; - goto unlock_out; - } - - /* Configure the context for the address space type */ - if (id_is_pid) - ret = damon_select_ops(ctx, DAMON_OPS_VADDR); - else - ret = damon_select_ops(ctx, DAMON_OPS_PADDR); - if (ret) - goto unlock_out; - - ret = dbgfs_set_targets(ctx, nr_targets, target_pids); - if (!ret) - ret = count; - -unlock_out: - mutex_unlock(&ctx->kdamond_lock); - kfree(target_pids); -out: - kfree(kbuf); - return ret; -} - -static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) -{ - struct damon_target *t; - struct damon_region *r; - int target_idx = 0; - int written = 0; - int rc; - - damon_for_each_target(t, c) { - damon_for_each_region(r, t) { - rc = scnprintf(&buf[written], len - written, - "%d %lu %lu\n", - target_idx, r->ar.start, r->ar.end); - if (!rc) - return -ENOMEM; - written += rc; - } - target_idx++; - } - return written; -} - -static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t len; - - kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return -ENOMEM; - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - mutex_unlock(&ctx->kdamond_lock); - len = -EBUSY; - goto out; - } - - len = sprint_init_regions(ctx, kbuf, count); - mutex_unlock(&ctx->kdamond_lock); - if (len < 0) - goto out; - len = simple_read_from_buffer(buf, count, ppos, kbuf, len); - -out: - kfree(kbuf); - return len; -} - -static int add_init_region(struct damon_ctx *c, int target_idx, - struct damon_addr_range *ar) -{ - struct damon_target *t; - struct damon_region *r, *prev; - unsigned long idx = 0; - int rc = -EINVAL; - - if (ar->start >= ar->end) - return -EINVAL; - - damon_for_each_target(t, c) { - if (idx++ == target_idx) { - r = damon_new_region(ar->start, ar->end); - if (!r) - return -ENOMEM; - damon_add_region(r, t); - if (damon_nr_regions(t) > 1) { - prev = damon_prev_region(r); - if (prev->ar.end > r->ar.start) { - damon_destroy_region(r, t); - return -EINVAL; - } - } - rc = 0; - } - } - 
return rc; -} - -static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) -{ - struct damon_target *t; - struct damon_region *r, *next; - int pos = 0, parsed, ret; - int target_idx; - struct damon_addr_range ar; - int err; - - damon_for_each_target(t, c) { - damon_for_each_region_safe(r, next, t) - damon_destroy_region(r, t); - } - - while (pos < len) { - ret = sscanf(&str[pos], "%d %lu %lu%n", - &target_idx, &ar.start, &ar.end, &parsed); - if (ret != 3) - break; - err = add_init_region(c, target_idx, &ar); - if (err) - goto fail; - pos += parsed; - } - - return 0; - -fail: - damon_for_each_target(t, c) { - damon_for_each_region_safe(r, next, t) - damon_destroy_region(r, t); - } - return err; -} - -static ssize_t dbgfs_init_regions_write(struct file *file, - const char __user *buf, size_t count, - loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t ret = count; - int err; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) { - ret = -EBUSY; - goto unlock_out; - } - - err = set_init_regions(ctx, kbuf, ret); - if (err) - ret = err; - -unlock_out: - mutex_unlock(&ctx->kdamond_lock); - kfree(kbuf); - return ret; -} - -static ssize_t dbgfs_kdamond_pid_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - struct damon_ctx *ctx = file->private_data; - char *kbuf; - ssize_t len; - - kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); - if (!kbuf) - return -ENOMEM; - - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) - len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid); - else - len = scnprintf(kbuf, count, "none\n"); - mutex_unlock(&ctx->kdamond_lock); - if (!len) - goto out; - len = simple_read_from_buffer(buf, count, ppos, kbuf, len); - -out: - kfree(kbuf); - return len; -} - -static int damon_dbgfs_open(struct inode *inode, struct file *file) -{ - damon_dbgfs_warn_deprecation(); - - file->private_data = inode->i_private; - - return nonseekable_open(inode, file); -} - -static const struct file_operations attrs_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_attrs_read, - .write = dbgfs_attrs_write, -}; - -static const struct file_operations schemes_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_schemes_read, - .write = dbgfs_schemes_write, -}; - -static const struct file_operations target_ids_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_target_ids_read, - .write = dbgfs_target_ids_write, -}; - -static const struct file_operations init_regions_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_init_regions_read, - .write = dbgfs_init_regions_write, -}; - -static const struct file_operations kdamond_pid_fops = { - .open = damon_dbgfs_open, - .read = dbgfs_kdamond_pid_read, -}; - -static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) -{ - const char * const file_names[] = {"attrs", "schemes", "target_ids", - "init_regions", "kdamond_pid"}; - const struct file_operations *fops[] = {&attrs_fops, &schemes_fops, - &target_ids_fops, &init_regions_fops, &kdamond_pid_fops}; - int i; - - for (i = 0; i < ARRAY_SIZE(file_names); i++) - debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); -} - -static void dbgfs_before_terminate(struct damon_ctx *ctx) -{ - struct damon_target *t, *next; - - if (!damon_target_has_pid(ctx)) - return; - - mutex_lock(&ctx->kdamond_lock); - damon_for_each_target_safe(t, next, ctx) { - put_pid(t->pid); - damon_destroy_target(t); - } - 
mutex_unlock(&ctx->kdamond_lock); -} - -static struct damon_ctx *dbgfs_new_ctx(void) -{ - struct damon_ctx *ctx; - - ctx = damon_new_ctx(); - if (!ctx) - return NULL; - - if (damon_select_ops(ctx, DAMON_OPS_VADDR) && - damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return NULL; - } - ctx->callback.before_terminate = dbgfs_before_terminate; - return ctx; -} - -static void dbgfs_destroy_ctx(struct damon_ctx *ctx) -{ - damon_destroy_ctx(ctx); -} - -static ssize_t damon_dbgfs_deprecated_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE; - - return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf)); -} - -/* - * Make a context of @name and create a debugfs directory for it. - * - * This function should be called while holding damon_dbgfs_lock. - * - * Returns 0 on success, negative error code otherwise. - */ -static int dbgfs_mk_context(char *name) -{ - struct dentry *root, **new_dirs, *new_dir; - struct damon_ctx **new_ctxs, *new_ctx; - - if (damon_nr_running_ctxs()) - return -EBUSY; - - new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) * - (dbgfs_nr_ctxs + 1), GFP_KERNEL); - if (!new_ctxs) - return -ENOMEM; - dbgfs_ctxs = new_ctxs; - - new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) * - (dbgfs_nr_ctxs + 1), GFP_KERNEL); - if (!new_dirs) - return -ENOMEM; - dbgfs_dirs = new_dirs; - - root = dbgfs_dirs[0]; - if (!root) - return -ENOENT; - - new_dir = debugfs_create_dir(name, root); - /* Below check is required for a potential duplicated name case */ - if (IS_ERR(new_dir)) - return PTR_ERR(new_dir); - dbgfs_dirs[dbgfs_nr_ctxs] = new_dir; - - new_ctx = dbgfs_new_ctx(); - if (!new_ctx) { - debugfs_remove(new_dir); - dbgfs_dirs[dbgfs_nr_ctxs] = NULL; - return -ENOMEM; - } - - dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx; - dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs], - dbgfs_ctxs[dbgfs_nr_ctxs]); - dbgfs_nr_ctxs++; - - return 0; -} - -static ssize_t dbgfs_mk_context_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - char *kbuf; - char *ctx_name; - ssize_t ret; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - ctx_name = kmalloc(count + 1, GFP_KERNEL); - if (!ctx_name) { - kfree(kbuf); - return -ENOMEM; - } - - /* Trim white space */ - if (sscanf(kbuf, "%s", ctx_name) != 1) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&damon_dbgfs_lock); - ret = dbgfs_mk_context(ctx_name); - if (!ret) - ret = count; - mutex_unlock(&damon_dbgfs_lock); - -out: - kfree(kbuf); - kfree(ctx_name); - return ret; -} - -/* - * Remove a context of @name and its debugfs directory. - * - * This function should be called while holding damon_dbgfs_lock. - * - * Return 0 on success, negative error code otherwise. 
- */ -static int dbgfs_rm_context(char *name) -{ - struct dentry *root, *dir, **new_dirs; - struct inode *inode; - struct damon_ctx **new_ctxs; - int i, j; - int ret = 0; - - if (damon_nr_running_ctxs()) - return -EBUSY; - - root = dbgfs_dirs[0]; - if (!root) - return -ENOENT; - - dir = debugfs_lookup(name, root); - if (!dir) - return -ENOENT; - - inode = d_inode(dir); - if (!S_ISDIR(inode->i_mode)) { - ret = -EINVAL; - goto out_dput; - } - - new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), - GFP_KERNEL); - if (!new_dirs) { - ret = -ENOMEM; - goto out_dput; - } - - new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs), - GFP_KERNEL); - if (!new_ctxs) { - ret = -ENOMEM; - goto out_new_dirs; - } - - for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) { - if (dbgfs_dirs[i] == dir) { - debugfs_remove(dbgfs_dirs[i]); - dbgfs_destroy_ctx(dbgfs_ctxs[i]); - continue; - } - new_dirs[j] = dbgfs_dirs[i]; - new_ctxs[j++] = dbgfs_ctxs[i]; - } - - kfree(dbgfs_dirs); - kfree(dbgfs_ctxs); - - dbgfs_dirs = new_dirs; - dbgfs_ctxs = new_ctxs; - dbgfs_nr_ctxs--; - - goto out_dput; - -out_new_dirs: - kfree(new_dirs); -out_dput: - dput(dir); - return ret; -} - -static ssize_t dbgfs_rm_context_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - char *kbuf; - ssize_t ret; - char *ctx_name; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - ctx_name = kmalloc(count + 1, GFP_KERNEL); - if (!ctx_name) { - kfree(kbuf); - return -ENOMEM; - } - - /* Trim white space */ - if (sscanf(kbuf, "%s", ctx_name) != 1) { - ret = -EINVAL; - goto out; - } - - mutex_lock(&damon_dbgfs_lock); - ret = dbgfs_rm_context(ctx_name); - if (!ret) - ret = count; - mutex_unlock(&damon_dbgfs_lock); - -out: - kfree(kbuf); - kfree(ctx_name); - return ret; -} - -static ssize_t dbgfs_monitor_on_read(struct file *file, - char __user *buf, size_t count, loff_t *ppos) -{ - char monitor_on_buf[5]; - bool monitor_on = damon_nr_running_ctxs() != 0; - int len; - - len = scnprintf(monitor_on_buf, 5, monitor_on ? 
"on\n" : "off\n"); - - return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len); -} - -static ssize_t dbgfs_monitor_on_write(struct file *file, - const char __user *buf, size_t count, loff_t *ppos) -{ - ssize_t ret; - char *kbuf; - - kbuf = user_input_str(buf, count, ppos); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - - /* Remove white space */ - if (sscanf(kbuf, "%s", kbuf) != 1) { - kfree(kbuf); - return -EINVAL; - } - - mutex_lock(&damon_dbgfs_lock); - if (!strncmp(kbuf, "on", count)) { - int i; - - for (i = 0; i < dbgfs_nr_ctxs; i++) { - if (damon_targets_empty(dbgfs_ctxs[i])) { - kfree(kbuf); - mutex_unlock(&damon_dbgfs_lock); - return -EINVAL; - } - } - ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true); - } else if (!strncmp(kbuf, "off", count)) { - ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); - } else { - ret = -EINVAL; - } - mutex_unlock(&damon_dbgfs_lock); - - if (!ret) - ret = count; - kfree(kbuf); - return ret; -} - -static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file) -{ - damon_dbgfs_warn_deprecation(); - return nonseekable_open(inode, file); -} - -static const struct file_operations deprecated_fops = { - .read = damon_dbgfs_deprecated_read, -}; - -static const struct file_operations mk_contexts_fops = { - .open = damon_dbgfs_static_file_open, - .write = dbgfs_mk_context_write, -}; - -static const struct file_operations rm_contexts_fops = { - .open = damon_dbgfs_static_file_open, - .write = dbgfs_rm_context_write, -}; - -static const struct file_operations monitor_on_fops = { - .open = damon_dbgfs_static_file_open, - .read = dbgfs_monitor_on_read, - .write = dbgfs_monitor_on_write, -}; - -static int __init __damon_dbgfs_init(void) -{ - struct dentry *dbgfs_root; - const char * const file_names[] = {"mk_contexts", "rm_contexts", - "monitor_on_DEPRECATED", "DEPRECATED"}; - const struct file_operations *fops[] = {&mk_contexts_fops, - &rm_contexts_fops, &monitor_on_fops, &deprecated_fops}; - int i; - - dbgfs_root = debugfs_create_dir("damon", NULL); - - for (i = 0; i < ARRAY_SIZE(file_names); i++) - debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL, - fops[i]); - dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); - - dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL); - if (!dbgfs_dirs) { - debugfs_remove(dbgfs_root); - return -ENOMEM; - } - dbgfs_dirs[0] = dbgfs_root; - - return 0; -} - -/* - * Functions for the initialization - */ - -static int __init damon_dbgfs_init(void) -{ - int rc = -ENOMEM; - - mutex_lock(&damon_dbgfs_lock); - dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); - if (!dbgfs_ctxs) - goto out; - dbgfs_ctxs[0] = dbgfs_new_ctx(); - if (!dbgfs_ctxs[0]) { - kfree(dbgfs_ctxs); - goto out; - } - dbgfs_nr_ctxs = 1; - - rc = __damon_dbgfs_init(); - if (rc) { - kfree(dbgfs_ctxs[0]); - kfree(dbgfs_ctxs); - pr_err("%s: dbgfs init failed\n", __func__); - } - -out: - mutex_unlock(&damon_dbgfs_lock); - return rc; -} - -module_init(damon_dbgfs_init); - -#include "tests/dbgfs-kunit.h" diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index a9ff35341d65..c834aa217835 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -198,7 +198,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -static bool __damos_pa_filter_out(struct damos_filter *filter, +static bool damos_pa_filter_match(struct damos_filter *filter, struct folio *folio) { bool matched = false; @@ -236,14 +236,18 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio) { struct damos_filter *filter; 
+ if (scheme->core_filters_allowed) + return false; + damos_for_each_filter(filter, scheme) { - if (__damos_pa_filter_out(filter, folio)) - return true; + if (damos_pa_filter_match(filter, folio)) + return !filter->allow; } return false; } -static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) +static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) { unsigned long addr, applied; LIST_HEAD(folio_list); @@ -258,7 +262,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) } } if (install_young_filter) { - filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true); + filter = damos_new_filter( + DAMOS_FILTER_TYPE_YOUNG, true, false); if (!filter) return 0; damos_add_filter(s, filter); @@ -272,6 +277,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s) if (damos_pa_filter_out(s, folio)) goto put_folio; + else + *sz_filter_passed += folio_size(folio); folio_clear_referenced(folio); folio_test_clear_young(folio); @@ -292,7 +299,8 @@ put_folio: } static inline unsigned long damon_pa_mark_accessed_or_deactivate( - struct damon_region *r, struct damos *s, bool mark_accessed) + struct damon_region *r, struct damos *s, bool mark_accessed, + unsigned long *sz_filter_passed) { unsigned long addr, applied = 0; @@ -304,6 +312,8 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( if (damos_pa_filter_out(s, folio)) goto put_folio; + else + *sz_filter_passed += folio_size(folio); if (mark_accessed) folio_mark_accessed(folio); @@ -317,15 +327,17 @@ put_folio: } static unsigned long damon_pa_mark_accessed(struct damon_region *r, - struct damos *s) + struct damos *s, unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, true); + return damon_pa_mark_accessed_or_deactivate(r, s, true, + sz_filter_passed); } static unsigned long damon_pa_deactivate_pages(struct damon_region *r, - struct damos *s) + struct damos *s, unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, false); + return damon_pa_mark_accessed_or_deactivate(r, s, false, + sz_filter_passed); } static unsigned int __damon_pa_migrate_folio_list( @@ -449,7 +461,8 @@ static unsigned long damon_pa_migrate_pages(struct list_head *folio_list, return nr_migrated; } -static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s) +static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) { unsigned long addr, applied; LIST_HEAD(folio_list); @@ -462,6 +475,8 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s) if (damos_pa_filter_out(s, folio)) goto put_folio; + else + *sz_filter_passed += folio_size(folio); if (!folio_isolate_lru(folio)) goto put_folio; @@ -474,23 +489,57 @@ put_folio: return applied * PAGE_SIZE; } +static bool damon_pa_scheme_has_filter(struct damos *s) +{ + struct damos_filter *f; + + damos_for_each_filter(f, s) + return true; + return false; +} + +static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) +{ + unsigned long addr; + LIST_HEAD(folio_list); + + if (!damon_pa_scheme_has_filter(s)) + return 0; + + addr = r->ar.start; + while (addr < r->ar.end) { + struct folio *folio = damon_get_folio(PHYS_PFN(addr)); + + if (!folio) { + addr += PAGE_SIZE; + continue; + } + + if (!damos_pa_filter_out(s, folio)) + *sz_filter_passed += folio_size(folio); + addr += 
folio_size(folio); + folio_put(folio); + } + return 0; +} static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damos *scheme, unsigned long *sz_filter_passed) { switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pa_pageout(r, scheme); + return damon_pa_pageout(r, scheme, sz_filter_passed); case DAMOS_LRU_PRIO: - return damon_pa_mark_accessed(r, scheme); + return damon_pa_mark_accessed(r, scheme, sz_filter_passed); case DAMOS_LRU_DEPRIO: - return damon_pa_deactivate_pages(r, scheme); + return damon_pa_deactivate_pages(r, scheme, sz_filter_passed); case DAMOS_MIGRATE_HOT: case DAMOS_MIGRATE_COLD: - return damon_pa_migrate(r, scheme); + return damon_pa_migrate(r, scheme, sz_filter_passed); case DAMOS_STAT: - break; + return damon_pa_stat(r, scheme, sz_filter_passed); default: /* DAMOS actions that not yet supported by 'paddr'. */ break; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 9e0077a9404e..a675150965e0 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -221,7 +221,7 @@ static int damon_reclaim_apply_parameters(void) } if (skip_anon) { - filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false); if (!filter) goto out; damos_add_filter(scheme, filter); diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 9a18f3c535d3..70d84bdc9f5f 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -45,19 +45,13 @@ void damon_sysfs_schemes_update_stats( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); -int damon_sysfs_schemes_update_regions_start( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx, bool total_bytes_only); - -void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx); - -bool damos_sysfs_regions_upd_done(void); - -int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); +void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s, + bool total_bytes_only, unsigned long sz_filter_passed); int damon_sysfs_schemes_clear_regions( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx); + struct damon_sysfs_schemes *sysfs_schemes); int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index b095457380b5..98f93ae9f59e 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -19,6 +19,7 @@ struct damon_sysfs_scheme_region { struct damon_addr_range ar; unsigned int nr_accesses; unsigned int age; + unsigned long sz_filter_passed; struct list_head list; }; @@ -74,6 +75,15 @@ static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr, return sysfs_emit(buf, "%u\n", region->age); } +static ssize_t sz_filter_passed_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->sz_filter_passed); +} + static void damon_sysfs_scheme_region_release(struct kobject *kobj) { struct damon_sysfs_scheme_region *region = container_of(kobj, @@ -95,11 +105,15 @@ static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr = static struct kobj_attribute damon_sysfs_scheme_region_age_attr = __ATTR_RO_MODE(age, 
0400); +static struct kobj_attribute damon_sysfs_scheme_region_sz_filter_passed_attr = + __ATTR_RO_MODE(sz_filter_passed, 0400); + static struct attribute *damon_sysfs_scheme_region_attrs[] = { &damon_sysfs_scheme_region_start_attr.attr, &damon_sysfs_scheme_region_end_attr.attr, &damon_sysfs_scheme_region_nr_accesses_attr.attr, &damon_sysfs_scheme_region_age_attr.attr, + &damon_sysfs_scheme_region_sz_filter_passed_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); @@ -114,55 +128,11 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = { * scheme regions directory */ -/* - * enum damos_sysfs_regions_upd_status - Represent DAMOS tried regions update - * status - * @DAMOS_TRIED_REGIONS_UPD_IDLE: Waiting for next request. - * @DAMOS_TRIED_REGIONS_UPD_STARTED: Update started. - * @DAMOS_TRIED_REGIONS_UPD_FINISHED: Update finished. - * - * Each DAMON-based operation scheme (&struct damos) has its own apply - * interval, and we need to expose the scheme tried regions based on only - * single snapshot. For this, we keep the tried regions update status for each - * scheme. The status becomes 'idle' at the beginning. - * - * Once the tried regions update request is received, the request handling - * start function (damon_sysfs_scheme_update_regions_start()) sets the status - * of all schemes as 'idle' again, and register ->before_damos_apply() - * callback. - * - * Then, the first followup ->before_damos_apply() callback - * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first - * ->after_sampling() or ->after_aggregation() callback - * (damon_sysfs_cmd_request_callback()) after the call is called only after - * the scheme is completely applied to the given snapshot. Hence the callback - * knows the situation by showing 'started' status, and sets the status as - * 'finished'. Then, damon_sysfs_before_damos_apply() understands the - * situation by showing the 'finished' status and do nothing. - * - * If DAMOS is not applied to any region due to any reasons including the - * access pattern, the watermarks, the quotas, and the filters, - * ->before_damos_apply() will not be called back. Until the situation is - * changed, the update will not be finished. To avoid this, - * damon_sysfs_after_sampling() set the status as 'finished' if more than two - * apply intervals of the scheme is passed while the state is 'idle'. - * - * Finally, the tried regions request handling finisher function - * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks. 
- */ -enum damos_sysfs_regions_upd_status { - DAMOS_TRIED_REGIONS_UPD_IDLE, - DAMOS_TRIED_REGIONS_UPD_STARTED, - DAMOS_TRIED_REGIONS_UPD_FINISHED, -}; - struct damon_sysfs_scheme_regions { struct kobject kobj; struct list_head regions_list; int nr_regions; unsigned long total_bytes; - enum damos_sysfs_regions_upd_status upd_status; - unsigned long upd_timeout_jiffies; }; static struct damon_sysfs_scheme_regions * @@ -178,7 +148,6 @@ damon_sysfs_scheme_regions_alloc(void) INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; regions->total_bytes = 0; - regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; return regions; } @@ -233,6 +202,7 @@ struct damon_sysfs_stats { unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; }; @@ -277,6 +247,15 @@ static ssize_t sz_applied_show(struct kobject *kobj, return sysfs_emit(buf, "%lu\n", stats->sz_applied); } +static ssize_t sz_ops_filter_passed_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_ops_filter_passed); +} + static ssize_t qt_exceeds_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -303,6 +282,9 @@ static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = __ATTR_RO_MODE(sz_applied, 0400); +static struct kobj_attribute damon_sysfs_stats_sz_ops_filter_passed_attr = + __ATTR_RO_MODE(sz_ops_filter_passed, 0400); + static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = __ATTR_RO_MODE(qt_exceeds, 0400); @@ -311,6 +293,7 @@ static struct attribute *damon_sysfs_stats_attrs[] = { &damon_sysfs_stats_sz_tried_attr.attr, &damon_sysfs_stats_nr_applied_attr.attr, &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_sz_ops_filter_passed_attr.attr, &damon_sysfs_stats_qt_exceeds_attr.attr, NULL, }; @@ -330,6 +313,7 @@ struct damon_sysfs_scheme_filter { struct kobject kobj; enum damos_filter_type type; bool matching; + bool allow; char *memcg_path; struct damon_addr_range addr_range; int target_idx; @@ -402,6 +386,30 @@ static ssize_t matching_store(struct kobject *kobj, return count; } +static ssize_t allow_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%c\n", filter->allow ? 
'Y' : 'N'); +} + +static ssize_t allow_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + bool allow; + int err = kstrtobool(buf, &allow); + + if (err) + return err; + + filter->allow = allow; + return count; +} + static ssize_t memcg_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -499,6 +507,9 @@ static struct kobj_attribute damon_sysfs_scheme_filter_type_attr = static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr = __ATTR_RW_MODE(matching, 0600); +static struct kobj_attribute damon_sysfs_scheme_filter_allow_attr = + __ATTR_RW_MODE(allow, 0600); + static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr = __ATTR_RW_MODE(memcg_path, 0600); @@ -514,6 +525,7 @@ static struct kobj_attribute damon_sysfs_scheme_filter_damon_target_idx_attr = static struct attribute *damon_sysfs_scheme_filter_attrs[] = { &damon_sysfs_scheme_filter_type_attr.attr, &damon_sysfs_scheme_filter_matching_attr.attr, + &damon_sysfs_scheme_filter_allow_attr.attr, &damon_sysfs_scheme_filter_memcg_path_attr.attr, &damon_sysfs_scheme_filter_addr_start_attr.attr, &damon_sysfs_scheme_filter_addr_end_attr.attr, @@ -1918,7 +1930,8 @@ static int damon_sysfs_add_scheme_filters(struct damos *scheme, sysfs_filters->filters_arr[i]; struct damos_filter *filter = damos_new_filter(sysfs_filter->type, - sysfs_filter->matching); + sysfs_filter->matching, + sysfs_filter->allow); int err; if (!filter) @@ -2122,32 +2135,32 @@ void damon_sysfs_schemes_update_stats( sysfs_stats->sz_tried = scheme->stat.sz_tried; sysfs_stats->nr_applied = scheme->stat.nr_applied; sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->sz_ops_filter_passed = + scheme->stat.sz_ops_filter_passed; sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; } } -/* - * damon_sysfs_schemes that need to update its schemes regions dir. Protected - * by damon_sysfs_lock - */ -static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; -static int damon_sysfs_schemes_region_idx; -static bool damos_regions_upd_total_bytes_only; - -/* - * DAMON callback that called before damos apply. While this callback is - * registered, damon_sysfs_lock should be held to ensure the regions - * directories exist. +/** + * damos_sysfs_populate_region_dir() - Populate a schemes tried region dir. + * @sysfs_schemes: Schemes directory to populate regions directory. + * @ctx: Corresponding DAMON context. + * @t: DAMON target of @r. + * @r: DAMON region to populate the directory for. + * @s: Corresponding scheme. + * @total_bytes_only: Whether the request is for bytes update only. + * @sz_filter_passed: Bytes of @r that passed filters of @s. + * + * Called from DAMOS walk callback while holding damon_sysfs_lock. 
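The extra boolean that damos_new_filter() now takes selects whether regions matching the filter are allowed (passed) or rejected; the new sysfs "allow" file above is what feeds it. A minimal sketch of building the same skip-anonymous-pages filter that mm/damon/reclaim.c installs after this change (example_add_skip_anon_filter() is a made-up name for illustration; the two DAMON calls are the ones shown in this diff):

static int example_add_skip_anon_filter(struct damos *scheme)
{
	struct damos_filter *filter;

	/* matching=true, allow=false: reject regions backed by anon pages */
	filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false);
	if (!filter)
		return -ENOMEM;
	damos_add_filter(scheme, filter);
	return 0;
}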
*/ -static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - struct damos *s) +void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s, bool total_bytes_only, + unsigned long sz_filter_passed) { struct damos *scheme; struct damon_sysfs_scheme_regions *sysfs_regions; struct damon_sysfs_scheme_region *region; - struct damon_sysfs_schemes *sysfs_schemes = - damon_sysfs_schemes_for_damos_callback; int schemes_idx = 0; damon_for_each_scheme(scheme, ctx) { @@ -2158,152 +2171,40 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, /* user could have removed the scheme sysfs dir */ if (schemes_idx >= sysfs_schemes->nr) - return 0; + return; sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; - if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_FINISHED) - return 0; - if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_IDLE) - sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_STARTED; sysfs_regions->total_bytes += r->ar.end - r->ar.start; - if (damos_regions_upd_total_bytes_only) - return 0; + if (total_bytes_only) + return; region = damon_sysfs_scheme_region_alloc(r); if (!region) - return 0; + return; + region->sz_filter_passed = sz_filter_passed; list_add_tail(®ion->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; if (kobject_init_and_add(®ion->kobj, &damon_sysfs_scheme_region_ktype, &sysfs_regions->kobj, "%d", - damon_sysfs_schemes_region_idx++)) { + sysfs_regions->nr_regions++)) { kobject_put(®ion->kobj); } - return 0; -} - -/* - * DAMON callback that called after each accesses sampling. While this - * callback is registered, damon_sysfs_lock should be held to ensure the - * regions directories exist. 
- */ -void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx) -{ - struct damon_sysfs_schemes *sysfs_schemes = - damon_sysfs_schemes_for_damos_callback; - struct damon_sysfs_scheme_regions *sysfs_regions; - int i; - - for (i = 0; i < sysfs_schemes->nr; i++) { - sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; - if (sysfs_regions->upd_status == - DAMOS_TRIED_REGIONS_UPD_STARTED || - time_after(jiffies, - sysfs_regions->upd_timeout_jiffies)) - sysfs_regions->upd_status = - DAMOS_TRIED_REGIONS_UPD_FINISHED; - } } /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_clear_regions( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) + struct damon_sysfs_schemes *sysfs_schemes) { - struct damos *scheme; - int schemes_idx = 0; + int i; - damon_for_each_scheme(scheme, ctx) { + for (i = 0; i < sysfs_schemes->nr; i++) { struct damon_sysfs_scheme *sysfs_scheme; - /* user could have removed the scheme sysfs dir */ - if (schemes_idx >= sysfs_schemes->nr) - break; - - sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; + sysfs_scheme = sysfs_schemes->schemes_arr[i]; damon_sysfs_scheme_regions_rm_dirs( sysfs_scheme->tried_regions); sysfs_scheme->tried_regions->total_bytes = 0; } return 0; } - -static struct damos *damos_sysfs_nth_scheme(int n, struct damon_ctx *ctx) -{ - struct damos *scheme; - int i = 0; - - damon_for_each_scheme(scheme, ctx) { - if (i == n) - return scheme; - i++; - } - return NULL; -} - -static void damos_tried_regions_init_upd_status( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) -{ - int i; - struct damos *scheme; - struct damon_sysfs_scheme_regions *sysfs_regions; - - for (i = 0; i < sysfs_schemes->nr; i++) { - sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; - scheme = damos_sysfs_nth_scheme(i, ctx); - if (!scheme) { - sysfs_regions->upd_status = - DAMOS_TRIED_REGIONS_UPD_FINISHED; - continue; - } - sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; - sysfs_regions->upd_timeout_jiffies = jiffies + - 2 * usecs_to_jiffies(scheme->apply_interval_us ? - scheme->apply_interval_us : - ctx->attrs.aggr_interval); - } -} - -/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ -int damon_sysfs_schemes_update_regions_start( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx, bool total_bytes_only) -{ - damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); - damon_sysfs_schemes_for_damos_callback = sysfs_schemes; - damos_tried_regions_init_upd_status(sysfs_schemes, ctx); - damos_regions_upd_total_bytes_only = total_bytes_only; - ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; - return 0; -} - -bool damos_sysfs_regions_upd_done(void) -{ - struct damon_sysfs_schemes *sysfs_schemes = - damon_sysfs_schemes_for_damos_callback; - struct damon_sysfs_scheme_regions *sysfs_regions; - int i; - - for (i = 0; i < sysfs_schemes->nr; i++) { - sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; - if (sysfs_regions->upd_status != - DAMOS_TRIED_REGIONS_UPD_FINISHED) - return false; - } - return true; -} - -/* - * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. 
Caller - * should unlock damon_sysfs_lock which held before - * damon_sysfs_schemes_update_regions_start() - */ -int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) -{ - damon_sysfs_schemes_for_damos_callback = NULL; - ctx->callback.before_damos_apply = NULL; - damon_sysfs_schemes_region_idx = 0; - return 0; -} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 58145d59881d..deeab04d3b46 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1181,25 +1181,9 @@ static int damon_sysfs_add_targets(struct damon_ctx *ctx, return 0; } -static bool damon_sysfs_schemes_regions_updating; - static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - struct damon_sysfs_kdamond *kdamond; - enum damon_sysfs_cmd cmd; - - /* damon_sysfs_schemes_update_regions_stop() might not yet called */ - kdamond = damon_sysfs_cmd_request.kdamond; - cmd = damon_sysfs_cmd_request.cmd; - if (kdamond && ctx == kdamond->damon_ctx && - (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS || - cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES) && - damon_sysfs_schemes_regions_updating) { - damon_sysfs_schemes_update_regions_stop(ctx); - damon_sysfs_schemes_regions_updating = false; - mutex_unlock(&damon_sysfs_lock); - } if (!damon_target_has_pid(ctx)) return; @@ -1214,57 +1198,24 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) /* * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. - * @kdamond: The kobject wrapper that associated to the kdamond thread. + * @data: The kobject wrapper that associated to the kdamond thread. * * This function reads the schemes stats of specific kdamond and update the * related values for sysfs files. This function should be called from DAMON - * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON - * contexts-internal data and DAMON sysfs variables. + * worker thread,to safely access the DAMON contexts-internal data. Caller + * should also ensure holding ``damon_syfs_lock``, and ->damon_ctx of @data is + * not NULL but a valid pointer, to safely access DAMON sysfs variables. 
*/ -static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +static int damon_sysfs_upd_schemes_stats(void *data) { + struct damon_sysfs_kdamond *kdamond = data; struct damon_ctx *ctx = kdamond->damon_ctx; - if (!ctx) - return -EINVAL; damon_sysfs_schemes_update_stats( kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; } -static int damon_sysfs_upd_schemes_regions_start( - struct damon_sysfs_kdamond *kdamond, bool total_bytes_only) -{ - struct damon_ctx *ctx = kdamond->damon_ctx; - - if (!ctx) - return -EINVAL; - return damon_sysfs_schemes_update_regions_start( - kdamond->contexts->contexts_arr[0]->schemes, ctx, - total_bytes_only); -} - -static int damon_sysfs_upd_schemes_regions_stop( - struct damon_sysfs_kdamond *kdamond) -{ - struct damon_ctx *ctx = kdamond->damon_ctx; - - if (!ctx) - return -EINVAL; - return damon_sysfs_schemes_update_regions_stop(ctx); -} - -static int damon_sysfs_clear_schemes_regions( - struct damon_sysfs_kdamond *kdamond) -{ - struct damon_ctx *ctx = kdamond->damon_ctx; - - if (!ctx) - return -EINVAL; - return damon_sysfs_schemes_clear_regions( - kdamond->contexts->contexts_arr[0]->schemes, ctx); -} - static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -1318,9 +1269,9 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) return err; } -static int damon_sysfs_commit_schemes_quota_goals( - struct damon_sysfs_kdamond *sysfs_kdamond) +static int damon_sysfs_commit_schemes_quota_goals(void *data) { + struct damon_sysfs_kdamond *sysfs_kdamond = data; struct damon_ctx *ctx; struct damon_sysfs_context *sysfs_ctx; @@ -1338,20 +1289,18 @@ static int damon_sysfs_commit_schemes_quota_goals( /* * damon_sysfs_upd_schemes_effective_quotas() - Update schemes effective quotas * sysfs files. - * @kdamond: The kobject wrapper that associated to the kdamond thread. + * @data: The kobject wrapper that associated to the kdamond thread. * * This function reads the schemes' effective quotas of specific kdamond and * update the related values for sysfs files. This function should be called * from DAMON callbacks while holding ``damon_syfs_lock``, to safely access the * DAMON contexts-internal data and DAMON sysfs variables. 
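The converted handlers take an opaque void *data and drop their own NULL-context checks; that validation moves to the dispatch helper (damon_sysfs_damon_call(), added further down), which refuses to issue the call when kdamond->damon_ctx is unset. A sketch of a handler written against the new convention (example_handler() and its body are illustrative only, not part of the patch):

static int example_handler(void *data)
{
	struct damon_sysfs_kdamond *kdamond = data;
	struct damon_ctx *ctx = kdamond->damon_ctx;

	/* ctx is non-NULL here: damon_sysfs_damon_call() already checked it */
	pr_debug("running in kdamond context for ctx %p\n", ctx);
	return 0;
}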
*/ -static int damon_sysfs_upd_schemes_effective_quotas( - struct damon_sysfs_kdamond *kdamond) +static int damon_sysfs_upd_schemes_effective_quotas(void *data) { + struct damon_sysfs_kdamond *kdamond = data; struct damon_ctx *ctx = kdamond->damon_ctx; - if (!ctx) - return -EINVAL; damos_sysfs_update_effective_quotas( kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; @@ -1371,67 +1320,27 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active, bool after_aggregation) { struct damon_sysfs_kdamond *kdamond; - bool total_bytes_only = false; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ - if (!damon_sysfs_schemes_regions_updating && - !mutex_trylock(&damon_sysfs_lock)) + if (!mutex_trylock(&damon_sysfs_lock)) return 0; kdamond = damon_sysfs_cmd_request.kdamond; if (!kdamond || kdamond->damon_ctx != c) goto out; switch (damon_sysfs_cmd_request.cmd) { - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: - err = damon_sysfs_upd_schemes_stats(kdamond); - break; case DAMON_SYSFS_CMD_COMMIT: if (!after_aggregation) goto out; err = damon_sysfs_commit_input(kdamond); break; - case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: - err = damon_sysfs_commit_schemes_quota_goals(kdamond); - break; - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: - total_bytes_only = true; - fallthrough; - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: - if (!damon_sysfs_schemes_regions_updating) { - err = damon_sysfs_upd_schemes_regions_start(kdamond, - total_bytes_only); - if (!err) { - damon_sysfs_schemes_regions_updating = true; - goto keep_lock_out; - } - } else { - damos_sysfs_mark_finished_regions_updates(c); - /* - * Continue regions updating if DAMON is till - * active and the update for all schemes is not - * finished. - */ - if (active && !damos_sysfs_regions_upd_done()) - goto keep_lock_out; - err = damon_sysfs_upd_schemes_regions_stop(kdamond); - damon_sysfs_schemes_regions_updating = false; - } - break; - case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: - err = damon_sysfs_clear_schemes_regions(kdamond); - break; - case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: - err = damon_sysfs_upd_schemes_effective_quotas(kdamond); - break; default: break; } /* Mark the request as invalid now. 
*/ damon_sysfs_cmd_request.kdamond = NULL; out: - if (!damon_sysfs_schemes_regions_updating) - mutex_unlock(&damon_sysfs_lock); -keep_lock_out: + mutex_unlock(&damon_sysfs_lock); return err; } @@ -1525,6 +1434,58 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) */ } +static int damon_sysfs_damon_call(int (*fn)(void *data), + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_call_control call_control = {}; + + if (!kdamond->damon_ctx) + return -EINVAL; + call_control.fn = fn; + call_control.data = kdamond; + return damon_call(kdamond->damon_ctx, &call_control); +} + +struct damon_sysfs_schemes_walk_data { + struct damon_sysfs_kdamond *sysfs_kdamond; + bool total_bytes_only; +}; + +/* populate the region directory */ +static void damon_sysfs_schemes_tried_regions_upd_one(void *data, struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s, unsigned long sz_filter_passed) +{ + struct damon_sysfs_schemes_walk_data *walk_data = data; + struct damon_sysfs_kdamond *sysfs_kdamond = walk_data->sysfs_kdamond; + + damos_sysfs_populate_region_dir( + sysfs_kdamond->contexts->contexts_arr[0]->schemes, + ctx, t, r, s, walk_data->total_bytes_only, + sz_filter_passed); +} + +static int damon_sysfs_update_schemes_tried_regions( + struct damon_sysfs_kdamond *sysfs_kdamond, bool total_bytes_only) +{ + struct damon_sysfs_schemes_walk_data walk_data = { + .sysfs_kdamond = sysfs_kdamond, + .total_bytes_only = total_bytes_only, + }; + struct damos_walk_control control = { + .walk_fn = damon_sysfs_schemes_tried_regions_upd_one, + .data = &walk_data, + }; + struct damon_ctx *ctx = sysfs_kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + + damon_sysfs_schemes_clear_regions( + sysfs_kdamond->contexts->contexts_arr[0]->schemes); + return damos_walk(ctx, &control); +} + /* * damon_sysfs_handle_cmd() - Handle a command for a specific kdamond. * @cmd: The command to handle. 
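Two mechanisms replace the old callback state machine: damon_call() runs a short handler in the kdamond worker, and damos_walk() invokes a walk_fn for every region a scheme was applied to, which is how the tried_regions directories get populated above. A condensed sketch of the walk side, using the damos_walk_control fields exactly as they appear in this diff (example_walk_fn() and example_walk() are made-up names; the walk_fn signature mirrors damon_sysfs_schemes_tried_regions_upd_one()):

static void example_walk_fn(void *data, struct damon_ctx *ctx,
		struct damon_target *t, struct damon_region *r,
		struct damos *s, unsigned long sz_filter_passed)
{
	/* called once per region the scheme was applied to */
	pr_debug("region [%lx, %lx) passed %lu bytes of ops filters\n",
		 r->ar.start, r->ar.end, sz_filter_passed);
}

static int example_walk(struct damon_ctx *ctx)
{
	struct damos_walk_control control = {
		.walk_fn = example_walk_fn,
		.data = NULL,
	};

	return damos_walk(ctx, &control);
}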
@@ -1543,12 +1504,29 @@ static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, { bool need_wait = true; - /* Handle commands that doesn't access DAMON context-internal data */ switch (cmd) { case DAMON_SYSFS_CMD_ON: return damon_sysfs_turn_damon_on(kdamond); case DAMON_SYSFS_CMD_OFF: return damon_sysfs_turn_damon_off(kdamond); + case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: + return damon_sysfs_damon_call( + damon_sysfs_commit_schemes_quota_goals, + kdamond); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: + return damon_sysfs_damon_call( + damon_sysfs_upd_schemes_stats, kdamond); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: + return damon_sysfs_update_schemes_tried_regions(kdamond, true); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: + return damon_sysfs_update_schemes_tried_regions(kdamond, false); + case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: + return damon_sysfs_schemes_clear_regions( + kdamond->contexts->contexts_arr[0]->schemes); + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS: + return damon_sysfs_damon_call( + damon_sysfs_upd_schemes_effective_quotas, + kdamond); default: break; } diff --git a/mm/damon/tests/.kunitconfig b/mm/damon/tests/.kunitconfig index a73be044fc9b..36a450f57b58 100644 --- a/mm/damon/tests/.kunitconfig +++ b/mm/damon/tests/.kunitconfig @@ -13,10 +13,3 @@ CONFIG_DAMON_VADDR_KUNIT_TEST=y CONFIG_SYSFS=y CONFIG_DAMON_SYSFS=y CONFIG_DAMON_SYSFS_KUNIT_TEST=y - -# for DAMON debugfs interface -CONFIG_DEBUG_FS=y -CONFIG_DAMON_PADDR=y -CONFIG_DAMON_DBGFS_DEPRECATED=y -CONFIG_DAMON_DBGFS=y -CONFIG_DAMON_DBGFS_KUNIT_TEST=y diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index cf22e09a3507..532c6a6f21f9 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -411,7 +411,7 @@ static void damos_test_new_filter(struct kunit *test) { struct damos_filter *filter; - filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false); KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON); KUNIT_EXPECT_EQ(test, filter->matching, true); KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list); @@ -425,7 +425,7 @@ static void damos_test_filter_out(struct kunit *test) struct damon_region *r, *r2; struct damos_filter *f; - f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true); + f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, false); f->addr_range = (struct damon_addr_range){ .start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6}; @@ -434,25 +434,25 @@ static void damos_test_filter_out(struct kunit *test) damon_add_region(r, t); /* region in the range */ - KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 2; - KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region after the range */ r->ar.start = DAMON_MIN_REGION * 6; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region started before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 4; - KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + 
KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); @@ -465,7 +465,7 @@ static void damos_test_filter_out(struct kunit *test) /* region started in the range */ r->ar.start = DAMON_MIN_REGION * 2; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); diff --git a/mm/damon/tests/dbgfs-kunit.h b/mm/damon/tests/dbgfs-kunit.h deleted file mode 100644 index 087e53f641a8..000000000000 --- a/mm/damon/tests/dbgfs-kunit.h +++ /dev/null @@ -1,173 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * DAMON Debugfs Interface Unit Tests - * - * Author: SeongJae Park <sj@kernel.org> - */ - -#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST - -#ifndef _DAMON_DBGFS_TEST_H -#define _DAMON_DBGFS_TEST_H - -#include <kunit/test.h> - -static void damon_dbgfs_test_str_to_ints(struct kunit *test) -{ - char *question; - int *answers; - int expected[] = {12, 35, 46}; - ssize_t nr_integers = 0, i; - - question = "123"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123, answers[0]); - kfree(answers); - - question = "123abc"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123, answers[0]); - kfree(answers); - - question = "a123"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); - kfree(answers); - - question = "12 35"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); - for (i = 0; i < nr_integers; i++) - KUNIT_EXPECT_EQ(test, expected[i], answers[i]); - kfree(answers); - - question = "12 35 46"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); - for (i = 0; i < nr_integers; i++) - KUNIT_EXPECT_EQ(test, expected[i], answers[i]); - kfree(answers); - - question = "12 35 abc 46"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); - for (i = 0; i < 2; i++) - KUNIT_EXPECT_EQ(test, expected[i], answers[i]); - kfree(answers); - - question = ""; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); - kfree(answers); - - question = "\n"; - answers = str_to_ints(question, strlen(question), &nr_integers); - KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); - kfree(answers); -} - -static void damon_dbgfs_test_set_targets(struct kunit *test) -{ - struct damon_ctx *ctx = dbgfs_new_ctx(); - char buf[64]; - - if (!damon_is_registered_ops(DAMON_OPS_PADDR)) { - dbgfs_destroy_ctx(ctx); - kunit_skip(test, "PADDR not registered"); - } - - /* Make DAMON consider target has no pid */ - damon_select_ops(ctx, DAMON_OPS_PADDR); - - dbgfs_set_targets(ctx, 0, NULL); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - - dbgfs_set_targets(ctx, 1, NULL); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n"); - - dbgfs_set_targets(ctx, 0, NULL); - sprint_target_ids(ctx, buf, 
64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - - dbgfs_destroy_ctx(ctx); -} - -static void damon_dbgfs_test_set_init_regions(struct kunit *test) -{ - struct damon_ctx *ctx = damon_new_ctx(); - /* Each line represents one region in ``<target idx> <start> <end>`` */ - char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", - "1 10 20\n", - "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n", - ""}; - /* Reading the file again will show sorted, clean output */ - char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n", - "1 10 20\n", - "0 39 59\n0 70 134\n1 10 20\n1 20 25\n", - ""}; - char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */ - "1 10 20\n 1 14 26\n", /* regions overlap */ - "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */ - char *input, *expect; - int i, rc; - char buf[256]; - - if (!damon_is_registered_ops(DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - kunit_skip(test, "PADDR not registered"); - } - - damon_select_ops(ctx, DAMON_OPS_PADDR); - - dbgfs_set_targets(ctx, 3, NULL); - - /* Put valid inputs and check the results */ - for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { - input = valid_inputs[i]; - expect = valid_expects[i]; - - rc = set_init_regions(ctx, input, strnlen(input, 256)); - KUNIT_EXPECT_EQ(test, rc, 0); - - memset(buf, 0, 256); - sprint_init_regions(ctx, buf, 256); - - KUNIT_EXPECT_STREQ(test, (char *)buf, expect); - } - /* Put invalid inputs and check the return error code */ - for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) { - input = invalid_inputs[i]; - pr_info("input: %s\n", input); - rc = set_init_regions(ctx, input, strnlen(input, 256)); - KUNIT_EXPECT_EQ(test, rc, -EINVAL); - - memset(buf, 0, 256); - sprint_init_regions(ctx, buf, 256); - - KUNIT_EXPECT_STREQ(test, (char *)buf, ""); - } - - dbgfs_set_targets(ctx, 0, NULL); - damon_destroy_ctx(ctx); -} - -static struct kunit_case damon_test_cases[] = { - KUNIT_CASE(damon_dbgfs_test_str_to_ints), - KUNIT_CASE(damon_dbgfs_test_set_targets), - KUNIT_CASE(damon_dbgfs_test_set_init_regions), - {}, -}; - -static struct kunit_suite damon_test_suite = { - .name = "damon-dbgfs", - .test_cases = damon_test_cases, -}; -kunit_test_suite(damon_test_suite); - -#endif /* _DAMON_DBGFS_TEST_H */ - -#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index b9fe3bc8472b..7cd944266a92 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -68,7 +68,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) static struct mm_struct mm; struct damon_addr_range regions[3] = {0}; /* 10-20-25, 200-210-220, 300-305, 307-330 */ - struct vm_area_struct vmas[] = { + static struct vm_area_struct vmas[] = { (struct vm_area_struct) {.vm_start = 10, .vm_end = 20}, (struct vm_area_struct) {.vm_start = 20, .vm_end = 25}, (struct vm_area_struct) {.vm_start = 200, .vm_end = 210}, diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index b9eaa20b73b9..a6174f725bd7 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -655,7 +655,7 @@ static unsigned long damos_madvise(struct damon_target *target, static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, - struct damos *scheme) + struct damos *scheme, unsigned long *sz_filter_passed) { int madv_action; diff --git a/mm/debug.c b/mm/debug.c index 95b6ab809c0e..8d2acf432385 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -249,6 +249,77 @@ void dump_mm(const struct mm_struct *mm) } EXPORT_SYMBOL(dump_mm); +void dump_vmg(const 
struct vma_merge_struct *vmg, const char *reason) +{ + if (reason) + pr_warn("vmg %px dumped because: %s\n", vmg, reason); + + if (!vmg) { + pr_warn("vmg %px state: (NULL)\n", vmg); + return; + } + + pr_warn("vmg %px state: mm %px pgoff %lx\n" + "vmi %px [%lx,%lx)\n" + "prev %px next %px vma %px\n" + "start %lx end %lx flags %lx\n" + "file %px anon_vma %px policy %px\n" + "uffd_ctx %px\n" + "anon_name %px\n" + "merge_flags %x state %x\n", + vmg, vmg->mm, vmg->pgoff, + vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0, + vmg->vmi ? vma_iter_end(vmg->vmi) : 0, + vmg->prev, vmg->next, vmg->vma, + vmg->start, vmg->end, vmg->flags, + vmg->file, vmg->anon_vma, vmg->policy, +#ifdef CONFIG_USERFAULTFD + vmg->uffd_ctx.ctx, +#else + (void *)0, +#endif + vmg->anon_name, + (int)vmg->merge_flags, (int)vmg->state); + + if (vmg->mm) { + pr_warn("vmg %px mm:\n", vmg); + dump_mm(vmg->mm); + } else { + pr_warn("vmg %px mm: (NULL)\n", vmg); + } + + if (vmg->vma) { + pr_warn("vmg %px vma:\n", vmg); + dump_vma(vmg->vma); + } else { + pr_warn("vmg %px vma: (NULL)\n", vmg); + } + + if (vmg->prev) { + pr_warn("vmg %px prev:\n", vmg); + dump_vma(vmg->prev); + } else { + pr_warn("vmg %px prev: (NULL)\n", vmg); + } + + if (vmg->next) { + pr_warn("vmg %px next:\n", vmg); + dump_vma(vmg->next); + } else { + pr_warn("vmg %px next: (NULL)\n", vmg); + } + +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE + if (vmg->vmi) { + pr_warn("vmg %px vmi:\n", vmg); + vma_iter_dump_tree(vmg->vmi); + } else { + pr_warn("vmg %px vmi: (NULL)\n", vmg); + } +#endif +} +EXPORT_SYMBOL(dump_vmg); + static bool page_init_poisoning __read_mostly = true; static int __init setup_vm_debug(char *str) diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index ce06b2884789..ff35b84a7b50 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -245,7 +245,10 @@ early_memremap_prot(resource_size_t phys_addr, unsigned long size, #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) -void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) +/* + * If no empty slot, handle that and return -ENOMEM. 
+ */ +int __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) { unsigned long slop, clen; char *p; @@ -256,12 +259,15 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) if (clen > MAX_MAP_CHUNK - slop) clen = MAX_MAP_CHUNK - slop; p = early_memremap(src & PAGE_MASK, clen + slop); + if (!p) + return -ENOMEM; memcpy(dest, p + slop, clen); early_memunmap(p, clen + slop); dest += clen; src += clen; size -= clen; } + return 0; } #else /* CONFIG_MMU */ diff --git a/mm/execmem.c b/mm/execmem.c index 317b6a8d35be..e6c4f5076ca8 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -257,7 +257,6 @@ out_unlock: static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; - unsigned long start, end; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -275,26 +274,18 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); - start = (unsigned long)p; - end = start + alloc_size; - - vunmap_range(start, end); - - err = execmem_set_direct_map_valid(vm, false); - if (err) - goto err_free_mem; - - err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages, - PMD_SHIFT); + err = set_memory_rox((unsigned long)p, vm->nr_pages); if (err) goto err_free_mem; err = execmem_cache_add(p, alloc_size); if (err) - goto err_free_mem; + goto err_reset_direct_map; return 0; +err_reset_direct_map: + execmem_set_direct_map_valid(vm, true); err_free_mem: vfree(p); return err; @@ -344,6 +335,28 @@ static bool execmem_cache_free(void *ptr) return true; } + +int execmem_make_temp_rw(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + int ret; + + ret = set_memory_nx(addr, nr); + if (ret) + return ret; + + return set_memory_rw(addr, nr); +} + +int execmem_restore_rox(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + + return set_memory_rox(addr, nr); +} + #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { diff --git a/mm/filemap.c b/mm/filemap.c index 4f476411a9a2..e9404290f2c6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -441,6 +441,24 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, EXPORT_SYMBOL(filemap_fdatawrite_range); /** + * filemap_fdatawrite_range_kick - start writeback on a range + * @mapping: target address_space + * @start: index to start writeback on + * @end: last (inclusive) index for writeback + * + * This is a non-integrity writeback helper, to start writing back folios + * for the indicated range. + * + * Return: %0 on success, negative error code otherwise. + */ +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end) +{ + return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); +} +EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); + +/** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space * @@ -1464,25 +1482,6 @@ static int folio_put_wait_locked(struct folio *folio, int state) } /** - * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue - * @folio: Folio defining the wait queue of interest - * @waiter: Waiter to add to the queue - * - * Add an arbitrary @waiter to the wait queue for the nominated @folio. 
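execmem_make_temp_rw() and execmem_restore_rox() give users of the ROX execmem cache a way to briefly open a region for writing and then seal it again. A sketch of the intended bracketing, under the assumption that callers simply pair the two helpers around the write (example_patch_text() is a made-up caller; real users would be module or alternatives patching code and may also need to synchronize the instruction cache, depending on the architecture):

static int example_patch_text(void *dst, const void *src, size_t len)
{
	int err;

	err = execmem_make_temp_rw(dst, len);	/* clear X, make writable */
	if (err)
		return err;

	memcpy(dst, src, len);

	return execmem_restore_rox(dst, len);	/* back to read-only + exec */
}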
- */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) -{ - wait_queue_head_t *q = folio_waitqueue(folio); - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, waiter); - folio_set_waiters(folio); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(folio_add_wait_queue); - -/** * folio_unlock - Unlock a locked folio. * @folio: The folio. * @@ -1590,6 +1589,27 @@ int folio_wait_private_2_killable(struct folio *folio) } EXPORT_SYMBOL(folio_wait_private_2_killable); +/* + * If folio was marked as dropbehind, then pages should be dropped when writeback + * completes. Do that now. If we fail, it's likely because of a big folio - + * just reset dropbehind for that case and latter completions should invalidate. + */ +static void folio_end_dropbehind_write(struct folio *folio) +{ + /* + * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, + * but can happen if normal writeback just happens to find dirty folios + * that were created as part of uncached writeback, and that writeback + * would otherwise not need non-IRQ handling. Just skip the + * invalidation in that case. + */ + if (in_task() && folio_trylock(folio)) { + if (folio->mapping) + folio_unmap_invalidate(folio->mapping, folio, 0); + folio_unlock(folio); + } +} + /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. @@ -1600,6 +1620,8 @@ EXPORT_SYMBOL(folio_wait_private_2_killable); */ void folio_end_writeback(struct folio *folio) { + bool folio_dropbehind = false; + VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* @@ -1621,9 +1643,14 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake_bit(). */ folio_get(folio); + if (!folio_test_dirty(folio)) + folio_dropbehind = folio_test_clear_dropbehind(folio); if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); acct_reclaim_writeback(folio); + + if (folio_dropbehind) + folio_end_dropbehind_write(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); @@ -1946,6 +1973,8 @@ no_page: /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); + if (fgp_flags & FGP_DONTCACHE) + __folio_set_dropbehind(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) @@ -1956,8 +1985,19 @@ no_page: if (err == -EEXIST) goto repeat; - if (err) + if (err) { + /* + * When NOWAIT I/O fails to allocate folios this could + * be due to a nonblocking memory allocation and not + * because the system actually is out of memory. + * Return -EAGAIN so that there caller retries in a + * blocking fashion instead of propagating -ENOMEM + * to the application. + */ + if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM) + err = -EAGAIN; return ERR_PTR(err); + } /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. 
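FGP_DONTCACHE marks a newly created folio as dropbehind, so it is invalidated once the read (or, as above, writeback) that brought it in completes, and the NOWAIT case now reports allocation failure as -EAGAIN instead of -ENOMEM. A sketch of an uncached lookup using the new flag (illustrative only, not taken from a real filesystem; the returned folio is locked because of FGP_LOCK):

static struct folio *example_uncached_lookup(struct address_space *mapping,
					     pgoff_t index)
{
	/* create the folio as dropbehind so it will not linger in the cache */
	return __filemap_get_folio(mapping, index,
				   FGP_LOCK | FGP_CREAT | FGP_DONTCACHE,
				   mapping_gfp_mask(mapping));
}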
@@ -1968,6 +2008,9 @@ no_page: if (!folio) return ERR_PTR(-ENOENT); + /* not an uncached lookup, clear uncached if set */ + if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) + folio_clear_dropbehind(folio); return folio; } EXPORT_SYMBOL(__filemap_get_folio); @@ -2450,18 +2493,22 @@ unlock_mapping: return error; } -static int filemap_create_folio(struct file *file, - struct address_space *mapping, loff_t pos, - struct folio_batch *fbatch) +static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t index; + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) + return -EAGAIN; + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; + if (iocb->ki_flags & IOCB_DONTCACHE) + __folio_set_dropbehind(folio); /* * Protect against truncate / hole punch. Grabbing invalidate_lock @@ -2477,7 +2524,7 @@ static int filemap_create_folio(struct file *file, * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); - index = (pos >> (PAGE_SHIFT + min_order)) << min_order; + index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2485,7 +2532,8 @@ static int filemap_create_folio(struct file *file, if (error) goto error; - error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); + error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, + folio); if (error) goto error; @@ -2506,6 +2554,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file, if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } @@ -2515,7 +2565,6 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; - struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; @@ -2530,20 +2579,21 @@ retry: filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { + DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); - page_cache_sync_readahead(mapping, ra, filp, index, - last_index - index); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; + page_cache_sync_ra(&ractl, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { - if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) - return -EAGAIN; - err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch); + err = filemap_create_folio(iocb, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; @@ -2584,6 +2634,20 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) return (pos1 >> shift == pos2 >> shift); } +static void filemap_end_dropbehind_read(struct address_space *mapping, + struct folio *folio) +{ + if (!folio_test_dropbehind(folio)) + return; + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return; + if (folio_trylock(folio)) { + if 
(folio_test_clear_dropbehind(folio)) + folio_unmap_invalidate(mapping, folio, 0); + folio_unlock(folio); + } +} + /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. @@ -2697,8 +2761,12 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) - folio_put(fbatch.folios[i]); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + filemap_end_dropbehind_read(mapping, folio); + folio_put(folio); + } folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); @@ -2839,8 +2907,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; - while (spliced < size && - !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + while (spliced < size && !pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); @@ -2897,7 +2964,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -2957,7 +3024,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) goto out; } @@ -4027,17 +4094,6 @@ retry: bytes = min(chunk - offset, bytes); balance_dirty_pages_ratelimited(mapping); - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - */ - if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { - status = -EFAULT; - break; - } - if (fatal_signal_pending(current)) { status = -EINTR; break; @@ -4055,6 +4111,12 @@ retry: if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); + /* + * Faults here on mmap()s can recurse into arbitrary + * filesystem code. Lots of locks are held that can + * deadlock. Use an atomic copy to avoid deadlocking + * in page fault handling. + */ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); @@ -4080,6 +4142,16 @@ retry: bytes = copied; goto retry; } + + /* + * 'folio' is now unlocked and faults on it can be + * handled. Ensure forward progress by trying to + * fault it in now. + */ + if (fault_in_iov_iter_readable(i, bytes) == bytes) { + status = -EFAULT; + break; + } } else { pos += status; written += status; @@ -4376,6 +4448,20 @@ resched: } /* + * See mincore: reveal pagecache information only for files + * that the calling process has write access to, or could (if + * tried) open for writing. + */ +static inline bool can_do_cachestat(struct file *f) +{ + if (f->f_mode & FMODE_WRITE) + return true; + if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f))) + return true; + return file_permission(f, MAY_WRITE) == 0; +} + +/* * The cachestat(2) system call. 
* * cachestat() returns the page cache statistics of a file in the @@ -4430,6 +4516,9 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd, if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; + if (!can_do_cachestat(fd_file(f))) + return -EPERM; + if (flags != 0) return -EINVAL; @@ -596,6 +596,33 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs, } #endif /* CONFIG_HAVE_GUP_FAST */ +/* Common code for can_follow_write_* */ +static inline bool can_follow_write_common(struct page *page, + struct vm_area_struct *vma, unsigned int flags) +{ + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + return page && PageAnon(page) && PageAnonExclusive(page); +} + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags, unsigned long address) { @@ -622,6 +649,18 @@ static struct page *no_page_table(struct vm_area_struct *vma, } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */ +static inline bool can_follow_write_pud(pud_t pud, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) +{ + /* If the pud is writable, we can write to the page. */ + if (pud_write(pud)) + return true; + + return can_follow_write_common(page, vma, flags); +} + static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, struct follow_page_context *ctx) @@ -634,10 +673,11 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, assert_spin_locked(pud_lockptr(mm, pudp)); - if ((flags & FOLL_WRITE) && !pud_write(pud)) + if (!pud_present(pud)) return NULL; - if (!pud_present(pud)) + if ((flags & FOLL_WRITE) && + !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags)) return NULL; pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; @@ -686,27 +726,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, if (pmd_write(pmd)) return true; - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ @@ -807,27 +827,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page, if (pte_write(pte)) return true; - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... 
or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ @@ -1294,9 +1294,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; - /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ - if (is_vm_hugetlb_page(vma)) - return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could @@ -2257,6 +2254,7 @@ EXPORT_SYMBOL(fault_in_readable); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address + * @locked: a pointer to an int denoting whether the mmap sem is held * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). @@ -2269,13 +2267,12 @@ EXPORT_SYMBOL(fault_in_readable); * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) +struct page *get_dump_page(unsigned long addr, int *locked) { struct page *page; - int locked = 0; int ret; - ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, + ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } @@ -2323,13 +2320,13 @@ static void pofs_unpin(struct pages_or_folios *pofs) /* * Returns the number of collected folios. Return value is always >= 0. 
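With the hugetlb special case dropped from check_vma_flags() and can_follow_write_pud() in place, FOLL_FORCE writes are expected to break COW on private hugetlb mappings much like on regular anonymous memory. A userspace sketch of the behaviour this should enable (it writes through /proc/self/mem, which uses FOLL_FORCE; the mmap() only succeeds if hugepages are configured, and the 2 MiB size is an assumption):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t sz = 2UL << 20;	/* assumes a 2 MiB default hugepage size */
	char *p = mmap(NULL, sz, PROT_READ,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	int fd = open("/proc/self/mem", O_RDWR);
	char byte = 0x42;

	if (p == MAP_FAILED || fd < 0)
		return 1;
	/* FOLL_FORCE path: used to fail with EFAULT on hugetlb mappings */
	if (pwrite(fd, &byte, 1, (uintptr_t)p) != 1)
		perror("pwrite");
	close(fd);
	return 0;
}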
*/ -static unsigned long collect_longterm_unpinnable_folios( +static void collect_longterm_unpinnable_folios( struct list_head *movable_folio_list, struct pages_or_folios *pofs) { - unsigned long i, collected = 0; struct folio *prev_folio = NULL; bool drain_allow = true; + unsigned long i; for (i = 0; i < pofs->nr_entries; i++) { struct folio *folio = pofs_get_folio(pofs, i); @@ -2341,13 +2338,11 @@ static unsigned long collect_longterm_unpinnable_folios( if (folio_is_longterm_pinnable(folio)) continue; - collected++; - if (folio_is_device_coherent(folio)) continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(folio, movable_folio_list); + folio_isolate_hugetlb(folio, movable_folio_list); continue; } @@ -2364,8 +2359,6 @@ static unsigned long collect_longterm_unpinnable_folios( NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } - - return collected; } /* @@ -2442,11 +2435,9 @@ static long check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs) { LIST_HEAD(movable_folio_list); - unsigned long collected; - collected = collect_longterm_unpinnable_folios(&movable_folio_list, - pofs); - if (!collected) + collect_longterm_unpinnable_folios(&movable_folio_list, pofs); + if (list_empty(&movable_folio_list)) return 0; return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs); @@ -3360,8 +3351,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end, return 0; if (gup_flags & FOLL_PIN) { - seq = raw_read_seqcount(¤t->mm->write_protect_seq); - if (seq & 1) + if (!raw_seqcount_try_begin(¤t->mm->write_protect_seq, seq)) return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index db64116a4f84..373781b21e5c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -617,6 +617,8 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN); +DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK); +DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); #ifdef CONFIG_SHMEM @@ -637,6 +639,8 @@ static struct attribute *anon_stats_attrs[] = { #ifndef CONFIG_SHMEM &zswpout_attr.attr, &swpin_attr.attr, + &swpin_fallback_attr.attr, + &swpin_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -669,6 +673,8 @@ static struct attribute *any_stats_attrs[] = { #ifdef CONFIG_SHMEM &zswpout_attr.attr, &swpin_attr.attr, + &swpin_fallback_attr.attr, + &swpin_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -2003,7 +2009,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) spin_unlock(vmf->ptl); writable = false; - if (!migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { flags |= TNF_MIGRATED; nid = target_nid; task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); @@ -3284,7 +3290,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ lruvec = folio_lruvec_lock(folio); - ClearPageHasHWPoisoned(head); + folio_clear_has_hwpoisoned(folio); for (i = nr - new_nr; i >= new_nr; i -= new_nr) { struct folio *tail; @@ -3298,7 +3304,7 @@ static void __split_huge_page(struct page *page, struct list_head 
*list, folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); __filemap_remove_folio(tail, NULL); - folio_put(tail); + folio_put_refs(tail, folio_nr_pages(tail)); } else if (!folio_test_anon(folio)) { __xa_store(&folio->mapping->i_pages, tail->index, tail, 0); @@ -4175,20 +4181,21 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, if (input_buf[0] == '/') { char *tok; - char *buf = input_buf; + char *tok_buf = input_buf; char file_path[MAX_INPUT_BUF_SZ]; pgoff_t off_start = 0, off_end = 0; size_t input_len = strlen(input_buf); - tok = strsep(&buf, ","); - if (tok && buf) { + tok = strsep(&tok_buf, ","); + if (tok && tok_buf) { strscpy(file_path, tok); } else { ret = -EINVAL; goto out; } - ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order); + ret = sscanf(tok_buf, "0x%lx,0x%lx,%d", &off_start, + &off_end, &new_order); if (ret != 2 && ret != 3) { ret = -EINVAL; goto out; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eaaec19caa7c..318624c96584 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -48,6 +48,7 @@ #include <linux/page_owner.h> #include "internal.h" #include "hugetlb_vmemmap.h" +#include <linux/page-isolation.h> int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; @@ -1246,69 +1247,6 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) hugetlb_dup_vma_private(vma); } -/* Returns true if the VMA has associated reserve pages */ -static bool vma_has_reserves(struct vm_area_struct *vma, long chg) -{ - if (vma->vm_flags & VM_NORESERVE) { - /* - * This address is already reserved by other process(chg == 0), - * so, we should decrement reserved count. Without decrementing, - * reserve count remains after releasing inode, because this - * allocated page will go into page cache and is regarded as - * coming from reserved pool in releasing step. Currently, we - * don't have any other solution to deal with this situation - * properly, so add work-around here. - */ - if (vma->vm_flags & VM_MAYSHARE && chg == 0) - return true; - else - return false; - } - - /* Shared mappings always use reserves */ - if (vma->vm_flags & VM_MAYSHARE) { - /* - * We know VM_NORESERVE is not set. Therefore, there SHOULD - * be a region map for all pages. The only situation where - * there is no region map is if a hole was punched via - * fallocate. In this case, there really are no reserves to - * use. This situation is indicated if chg != 0. - */ - if (chg) - return false; - else - return true; - } - - /* - * Only the process that called mmap() has reserves for - * private mappings. - */ - if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - /* - * Like the shared case above, a hole punch or truncate - * could have been performed on the private mapping. - * Examine the value of chg to determine if reserves - * actually exist or were previously consumed. - * Very Subtle - The value of chg comes from a previous - * call to vma_needs_reserves(). The reserve map for - * private mappings has different (opposite) semantics - * than that of shared mappings. vma_needs_reserves() - * has already taken this difference in semantics into - * account. Therefore, the meaning of chg is the same - * as in the shared case above. Code could easily be - * combined, but keeping it separate draws attention to - * subtle differences. 
- */ - if (chg) - return false; - else - return true; - } - - return false; -} - static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); @@ -1336,6 +1274,9 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, if (folio_test_hwpoison(folio)) continue; + if (is_migrate_isolate_page(&folio->page)) + continue; + list_move(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); folio_clear_hugetlb_freed(folio); @@ -1394,8 +1335,7 @@ static unsigned long available_huge_pages(struct hstate *h) static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, int avoid_reserve, - long chg) + unsigned long address, long gbl_chg) { struct folio *folio = NULL; struct mempolicy *mpol; @@ -1404,15 +1344,10 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, int nid; /* - * A child process with MAP_PRIVATE mappings created by their parent - * have no page reserves. This check ensures that reservations are - * not "stolen". The child may still get SIGKILLed + * gbl_chg==1 means the allocation requires a new page that was not + * reserved before. Making sure there's at least one free page. */ - if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) - goto err; - - /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && !available_huge_pages(h)) + if (gbl_chg && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -1430,11 +1365,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); - if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } - mpol_cond_put(mpol); return folio; @@ -2205,6 +2135,8 @@ retry: if (!folio_ref_count(folio)) { struct hstate *h = folio_hstate(folio); + bool adjust_surplus = false; + if (!available_huge_pages(h)) goto out; @@ -2227,7 +2159,9 @@ retry: goto retry; } - remove_hugetlb_folio(h, folio, false); + if (h->surplus_huge_pages_node[folio_nid(folio)]) + adjust_surplus = true; + remove_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); @@ -2247,7 +2181,7 @@ retry: rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); + add_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages++; goto out; } @@ -2463,7 +2397,13 @@ static int gather_surplus_pages(struct hstate *h, long delta) long needed, allocated; bool alloc_ok = true; int node; - nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); + nodemask_t *mbind_nodemask, alloc_nodemask; + + mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); + if (mbind_nodemask) + nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed); + else + alloc_nodemask = cpuset_current_mems_allowed; lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; @@ -2479,8 +2419,16 @@ retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { folio = NULL; - for_each_node_mask(node, cpuset_current_mems_allowed) { - if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) { + + /* Prioritize current node */ + if (node_isset(numa_mem_id(), alloc_nodemask)) + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), + numa_mem_id(), NULL); + + if (!folio) { + for_each_node_mask(node, alloc_nodemask) { 
+ if (node == numa_mem_id()) + continue; folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), node, NULL); if (folio) @@ -2868,7 +2816,7 @@ retry: * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - isolated = isolate_hugetlb(old_folio, list); + isolated = folio_isolate_hugetlb(old_folio, list); ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; @@ -2953,7 +2901,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) + if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -2961,69 +2909,137 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } +/* + * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn + * range with new folios. + * @start_pfn: start pfn of the given pfn range + * @end_pfn: end pfn of the given pfn range + * Returns 0 on success, otherwise negated error. + */ +int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) +{ + struct hstate *h; + struct folio *folio; + int ret = 0; + + LIST_HEAD(isolate_list); + + while (start_pfn < end_pfn) { + folio = pfn_folio(start_pfn); + if (folio_test_hugetlb(folio)) { + h = folio_hstate(folio); + } else { + start_pfn++; + continue; + } + + if (!folio_ref_count(folio)) { + ret = alloc_and_dissolve_hugetlb_folio(h, folio, + &isolate_list); + if (ret) + break; + + putback_movable_pages(&isolate_list); + } + start_pfn++; + } + + return ret; +} + +void wait_for_freed_hugetlb_folios(void) +{ + if (llist_empty(&hpage_freelist)) + return; + + flush_work(&free_hpage_work); +} + +typedef enum { + /* + * For either 0/1: we checked the per-vma resv map, and one resv + * count either can be reused (0), or an extra needed (1). + */ + MAP_CHG_REUSE = 0, + MAP_CHG_NEEDED = 1, + /* + * Cannot use per-vma resv count can be used, hence a new resv + * count is enforced. + * + * NOTE: This is mostly identical to MAP_CHG_NEEDED, except + * that currently vma_needs_reservation() has an unwanted side + * effect to either use end() or commit() to complete the + * transaction. Hence it needs to differenciate from NEEDED. + */ + MAP_CHG_ENFORCED = 2, +} map_chg_state; + +/* + * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW + * faults of hugetlb private mappings on top of a non-page-cache folio (in + * which case even if there's a private vma resv map it won't cover such + * allocation). New call sites should (probably) never set it to true!! + * When it's set, the allocation will bypass all vma level reservations. 
+ */ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) + unsigned long addr, bool cow_from_owner) { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit, nr_pages = pages_per_huge_page(h); - long gbl_chg; - int memcg_charge_ret, ret, idx; + long retval, gbl_chg; + map_chg_state map_chg; + int ret, idx; struct hugetlb_cgroup *h_cg = NULL; - struct mem_cgroup *memcg; - bool deferred_reserve; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; - memcg = get_mem_cgroup_from_current(); - memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); - if (memcg_charge_ret == -ENOMEM) { - mem_cgroup_put(memcg); - return ERR_PTR(-ENOMEM); - } - idx = hstate_index(h); - /* - * Examine the region/reserve map to determine if the process - * has a reservation for the page to be allocated. A return - * code of zero indicates a reservation exists (no change). - */ - map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) { - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); - return ERR_PTR(-ENOMEM); + + /* Whether we need a separate per-vma reservation? */ + if (cow_from_owner) { + /* + * Special case! Since it's a CoW on top of a reserved + * page, the private resv map doesn't count. So it cannot + * consume the per-vma resv map even if it's reserved. + */ + map_chg = MAP_CHG_ENFORCED; + } else { + /* + * Examine the region/reserve map to determine if the process + * has a reservation for the page to be allocated. A return + * code of zero indicates a reservation exists (no change). + */ + retval = vma_needs_reservation(h, vma, addr); + if (retval < 0) + return ERR_PTR(-ENOMEM); + map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE; } /* + * Whether we need a separate global reservation? + * * Processes that did not create the mapping will have no * reserves as indicated by the region/reserve map. Check * that the allocation will not exceed the subpool limit. - * Allocations for MAP_NORESERVE mappings also need to be - * checked against any subpool limit. + * Or if it can get one from the pool reservation directly. */ - if (map_chg || avoid_reserve) { + if (map_chg) { gbl_chg = hugepage_subpool_get_pages(spool, 1); if (gbl_chg < 0) goto out_end_reservation; - + } else { /* - * Even though there was no reservation in the region/reserve - * map, there could be reservations associated with the - * subpool that can be used. This would be indicated if the - * return value of hugepage_subpool_get_pages() is zero. - * However, if avoid_reserve is specified we still avoid even - * the subpool reservations. + * If we have the vma reservation ready, no need for extra + * global reservation. */ - if (avoid_reserve) - gbl_chg = 1; + gbl_chg = 0; } - /* If this allocation is not consuming a reservation, charge it now. + /* + * If this allocation is not consuming a per-vma reservation, + * charge the hugetlb cgroup now. */ - deferred_reserve = map_chg || avoid_reserve; - if (deferred_reserve) { + if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); if (ret) @@ -3040,27 +3056,32 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. 
*/ - folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); + folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg); if (!folio) { spin_unlock_irq(&hugetlb_lock); folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); - if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } list_add(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); /* Fall through */ } + /* + * Either dequeued or buddy-allocated folio needs to add special + * mark to the folio when it consumes a global reservation. + */ + if (!gbl_chg) { + folio_set_hugetlb_restore_reserve(folio); + h->resv_huge_pages--; + } + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ - if (deferred_reserve) { + if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), h_cg, folio); } @@ -3069,50 +3090,61 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_set_folio_subpool(folio, spool); - map_commit = vma_commit_reservation(h, vma, addr); - if (unlikely(map_chg > map_commit)) { + if (map_chg != MAP_CHG_ENFORCED) { + /* commit() is only needed if the map_chg is not enforced */ + retval = vma_commit_reservation(h, vma, addr); /* + * Check for possible race conditions. When it happens.. * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. * This indicates a race with hugetlb_reserve_pages. * Adjust for the subpool count incremented above AND - * in hugetlb_reserve_pages for the same page. Also, + * in hugetlb_reserve_pages for the same page. Also, * the reservation count added in hugetlb_reserve_pages * no longer applies. */ - long rsv_adjust; + if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) { + long rsv_adjust; - rsv_adjust = hugepage_subpool_put_pages(spool, 1); - hugetlb_acct_memory(h, -rsv_adjust); - if (deferred_reserve) { - spin_lock_irq(&hugetlb_lock); - hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), - pages_per_huge_page(h), folio); - spin_unlock_irq(&hugetlb_lock); + rsv_adjust = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -rsv_adjust); + if (map_chg) { + spin_lock_irq(&hugetlb_lock); + hugetlb_cgroup_uncharge_folio_rsvd( + hstate_index(h), pages_per_huge_page(h), + folio); + spin_unlock_irq(&hugetlb_lock); + } } } - if (!memcg_charge_ret) - mem_cgroup_commit_charge(folio, memcg); + ret = mem_cgroup_charge_hugetlb(folio, gfp); + /* + * Unconditionally increment NR_HUGETLB here. If it turns out that + * mem_cgroup_charge_hugetlb failed, then immediately free the page and + * decrement NR_HUGETLB. 
+ */ lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); - mem_cgroup_put(memcg); + + if (ret == -ENOMEM) { + free_huge_folio(folio); + return ERR_PTR(-ENOMEM); + } return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_uncharge_cgroup_reservation: - if (deferred_reserve) + if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg || avoid_reserve) + if (map_chg) hugepage_subpool_put_pages(spool, 1); out_end_reservation: - vma_end_reservation(h, vma, addr); - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); + if (map_chg != MAP_CHG_ENFORCED) + vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -3125,7 +3157,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) /* do node specific alloc */ if (nid != NUMA_NO_NODE) { - m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), + m = memblock_alloc_exact_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!m) return 0; @@ -3289,7 +3321,7 @@ static void __init gather_bootmem_prealloc(void) .thread_fn = gather_bootmem_prealloc_parallel, .fn_arg = NULL, .start = 0, - .size = num_node_state(N_MEMORY), + .size = nr_node_ids, .align = 1, .min_chunk = 1, .max_threads = num_node_state(N_MEMORY), @@ -3806,13 +3838,15 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { struct page *page = folio_page(folio, i); + /* Careful: see __split_huge_page_tail() */ + struct folio *new_folio = (struct folio *)page; - page->mapping = NULL; clear_compound_head(page); prep_compound_page(page, dst->order); - init_new_hugetlb_folio(dst, page_folio(page)); - list_add(&page->lru, &dst_list); + new_folio->mapping = NULL; + init_new_hugetlb_folio(dst, new_folio); + list_add(&new_folio->lru, &dst_list); } } @@ -4845,7 +4879,7 @@ out: return ret; } -static struct ctl_table hugetlb_table[] = { +static const struct ctl_table hugetlb_table[] = { { .procname = "nr_hugepages", .data = NULL, @@ -5141,12 +5175,12 @@ const struct vm_operations_struct hugetlb_vm_ops = { }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, - int writable) + bool try_mkwrite) { pte_t entry; unsigned int shift = huge_page_shift(hstate_vma(vma)); - if (writable) { + if (try_mkwrite && (vma->vm_flags & VM_WRITE)) { entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, vma->vm_page_prot))); } else { @@ -5169,6 +5203,13 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } +static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + if (vma->vm_flags & VM_WRITE) + set_huge_ptep_writable(vma, address, ptep); +} + bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; @@ -5199,7 +5240,7 @@ static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, 1); + pte_t newpte = make_huge_pte(vma, &new_folio->page, true); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); @@ -5333,7 +5374,7 @@ again: spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); + new_folio = 
alloc_hugetlb_folio(dst_vma, addr, false); if (IS_ERR(new_folio)) { folio_put(pte_folio); ret = PTR_ERR(new_folio); @@ -5418,7 +5459,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); + pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) huge_pte_clear(mm, new_addr, dst_pte, sz); @@ -5593,7 +5634,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } - pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); @@ -5799,7 +5840,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; - int outside_reserve = 0; + bool cow_from_owner = 0; vm_fault_t ret = 0; struct mmu_notifier_range range; @@ -5814,13 +5855,6 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, if (!unshare && huge_pte_uffd_wp(pte)) return 0; - /* - * hugetlb does not support FOLL_FORCE-style write faults that keep the - * PTE mapped R/O such as maybe_mkwrite() would do. - */ - if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) - return VM_FAULT_SIGSEGV; - /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { set_huge_ptep_writable(vma, vmf->address, vmf->pte); @@ -5849,7 +5883,8 @@ retry_avoidcopy: SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) - set_huge_ptep_writable(vma, vmf->address, vmf->pte); + set_huge_ptep_maybe_writable(vma, vmf->address, + vmf->pte); delayacct_wpcopy_end(); return 0; @@ -5868,7 +5903,7 @@ retry_avoidcopy: */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_folio != pagecache_folio) - outside_reserve = 1; + cow_from_owner = true; folio_get(old_folio); @@ -5877,7 +5912,7 @@ retry_avoidcopy: * be acquired again before returning to the caller, as expected. */ spin_unlock(vmf->ptl); - new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve); + new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner); if (IS_ERR(new_folio)) { /* @@ -5887,7 +5922,7 @@ retry_avoidcopy: * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. */ - if (outside_reserve) { + if (cow_from_owner) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; @@ -6138,7 +6173,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, goto out; } - folio = alloc_hugetlb_folio(vma, vmf->address, 0); + folio = alloc_hugetlb_folio(vma, vmf->address, false); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being @@ -6235,8 +6270,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); - new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) - && (vma->vm_flags & VM_SHARED))); + new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. 
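For readers following the alloc_hugetlb_folio() rework in the hunks above, the relationship between cow_from_owner, the resulting map_chg_state, and which counters end up charged can be summarised in a small standalone model. This is an illustrative userspace sketch only: the enum values mirror the patch, but model_alloc() and its flags are hypothetical, and it deliberately ignores the subpool's own reserve accounting (which can still yield gbl_chg == 0 in the MAP_CHG_NEEDED case).

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the tri-state introduced above; everything else is illustrative. */
enum map_chg_state {
	MAP_CHG_REUSE = 0,	/* a per-vma reservation can be reused */
	MAP_CHG_NEEDED = 1,	/* no per-vma reservation, a new count is needed */
	MAP_CHG_ENFORCED = 2,	/* CoW from owner: bypass the per-vma resv map */
};

static void model_alloc(bool cow_from_owner, bool vma_has_resv)
{
	enum map_chg_state map_chg;
	bool takes_subpool, charges_rsvd_cgroup, restores_reserve;

	if (cow_from_owner)
		map_chg = MAP_CHG_ENFORCED;
	else
		map_chg = vma_has_resv ? MAP_CHG_REUSE : MAP_CHG_NEEDED;

	/* Only a non-REUSE state asks the subpool/global pool for a page ... */
	takes_subpool = (map_chg != MAP_CHG_REUSE);
	/* ... and only then is the reservation-side hugetlb cgroup charged. */
	charges_rsvd_cgroup = takes_subpool;
	/*
	 * gbl_chg == 0 (simplified here to "a reservation was reused") is what
	 * later sets the restore-reserve flag and decrements resv_huge_pages.
	 */
	restores_reserve = !takes_subpool;

	printf("cow=%d vma_resv=%d -> map_chg=%d subpool=%d rsvd_cgroup=%d restore=%d\n",
	       cow_from_owner, vma_has_resv, map_chg,
	       takes_subpool, charges_rsvd_cgroup, restores_reserve);
}

int main(void)
{
	model_alloc(false, true);	/* fault with an existing reservation */
	model_alloc(false, false);	/* no reservation: new global count */
	model_alloc(true, false);	/* CoW from owner: always enforced */
	return 0;
}

Note that in the MAP_CHG_NEEDED case the patch still calls vma_commit_reservation() afterwards and unwinds the subpool count (and the rsvd cgroup charge) if a racing hugetlb_reserve_pages() added the map entry in the meantime, as shown in the commit/race-handling hunk above.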
@@ -6568,7 +6602,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, spinlock_t *ptl; int ret = -ENOMEM; struct folio *folio; - int writable; bool folio_in_pagecache = false; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { @@ -6606,7 +6639,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { ret = -ENOMEM; goto out; @@ -6648,7 +6681,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; @@ -6722,12 +6755,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ - if (wp_enabled || (is_continue && !vm_shared)) - writable = 0; - else - writable = dst_vma->vm_flags & VM_WRITE; - - _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); + _dst_pte = make_huge_pte(dst_vma, &folio->page, + !wp_enabled && !(is_continue && !vm_shared)); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not @@ -7406,7 +7435,24 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ -bool isolate_hugetlb(struct folio *folio, struct list_head *list) +/** + * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio + * @folio: the folio to isolate + * @list: the list to add the folio to on success + * + * Isolate an allocated (refcount > 0) hugetlb folio, marking it as + * isolated/non-migratable, and moving it from the active list to the + * given list. + * + * Isolation will fail if @folio is not an allocated hugetlb folio, or if + * it is already isolated/non-migratable. + * + * On success, an additional folio reference is taken that must be dropped + * using folio_putback_hugetlb() to undo the isolation. + * + * Return: True if isolation worked, otherwise False. + */ +bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; @@ -7454,7 +7500,18 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return ret; } -void folio_putback_active_hugetlb(struct folio *folio) +/** + * folio_putback_hugetlb - unisolate a hugetlb folio + * @folio: the isolated hugetlb folio + * + * Putback/un-isolate the hugetlb folio that was previous isolated using + * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it + * back onto the active list. + * + * Will drop the additional folio reference obtained through + * folio_isolate_hugetlb(). + */ +void folio_putback_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(folio); @@ -7501,6 +7558,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re } spin_unlock_irq(&hugetlb_lock); } + + /* + * Our old folio is isolated and has "migratable" cleared until it + * is putback. As migration succeeded, set the new folio "migratable" + * and add it to the active list. 
+ */ + spin_lock_irq(&hugetlb_lock); + folio_set_hugetlb_migratable(new_folio); + list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist); + spin_unlock_irq(&hugetlb_lock); } static void hugetlb_unshare_pmds(struct vm_area_struct *vma, diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index e716c4671a15..bb9578bd99f9 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -195,24 +195,23 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) * cannot fail. */ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { unsigned int nr_pages; struct page_counter *counter; - struct hugetlb_cgroup *page_hcg; + struct hugetlb_cgroup *hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); - struct folio *folio = page_folio(page); - page_hcg = hugetlb_cgroup_from_folio(folio); + hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely * ignore those pages. */ - if (!page_hcg || page_hcg != h_cg) + if (!hcg || hcg != h_cg) goto out; - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); if (!parent) { parent = root_h_cgroup; /* root has no limit */ @@ -235,13 +234,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; - struct page *page; + struct folio *folio; do { for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); - list_for_each_entry(page, &h->hugepage_activelist, lru) - hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page); + list_for_each_entry(folio, &h->hugepage_activelist, lru) + hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio); spin_unlock_irq(&hugetlb_lock); } @@ -917,7 +916,6 @@ void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); - return; } static struct cftype hugetlb_files[] = { diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 57b7f591eee8..7735972add01 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -693,7 +693,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l free_vmemmap_page_list(&vmemmap_pages); } -static struct ctl_table hugetlb_vmemmap_sysctls[] = { +static const struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", .data = &vmemmap_optimize_enabled, diff --git a/mm/init-mm.c b/mm/init-mm.c index 24c809379274..6af3ad675930 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -40,7 +40,7 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), #ifdef CONFIG_PER_VMA_LOCK - .mm_lock_seq = 0, + .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), #endif .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, diff --git a/mm/internal.h b/mm/internal.h index 9826f7dce607..20b3535935a3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -392,6 +392,8 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); +int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, + gfp_t gfp); void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, unsigned int order); @@ -735,15 +737,30 @@ static inline void 
prep_compound_tail(struct page *head, int tail_idx) extern void prep_compound_page(struct page *page, unsigned int order); -extern void post_alloc_hook(struct page *page, unsigned int order, - gfp_t gfp_flags); +void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; -void free_unref_page(struct page *page, unsigned int order); +struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid, + nodemask_t *); +#define __alloc_frozen_pages(...) \ + alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__)) +void free_frozen_pages(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); +#ifdef CONFIG_NUMA +struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order); +#else +static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order) +{ + return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL); +} +#endif + +#define alloc_frozen_pages(...) \ + alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) + extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); @@ -824,10 +841,6 @@ int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); -int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, - int migratetype); - /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); @@ -1102,7 +1115,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) * mm/memory-failure.c */ #ifdef CONFIG_MEMORY_FAILURE -void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu); +int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill); void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); @@ -1125,8 +1138,9 @@ unsigned long page_mapped_in_vma(const struct page *page, struct vm_area_struct *vma); #else -static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { + return -EBUSY; } #endif @@ -1440,22 +1454,6 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); -#ifdef CONFIG_64BIT -static inline int can_do_mseal(unsigned long flags) -{ - if (flags) - return -EINVAL; - - return 0; -} - -#else -static inline int can_do_mseal(unsigned long flags) -{ - return -EPERM; -} -#endif - #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) @@ -1530,4 +1528,23 @@ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); +/* pt_reclaim.c */ +bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); +void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, + pmd_t pmdval); +void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, + struct mmu_gather *tlb); + +#ifdef CONFIG_PT_RECLAIM +bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details); +#else +static inline bool reclaim_pt_is_enabled(unsigned long start, 
unsigned long end, + struct zap_details *details) +{ + return false; +} +#endif /* CONFIG_PT_RECLAIM */ + + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 8b9e348113b1..d54e89f8c3e7 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -524,7 +524,11 @@ size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) sizeof(struct kasan_free_meta) : 0); } -static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) +/* + * This function avoids dynamic memory allocations and thus can be called from + * contexts that do not allow allocating memory. + */ +void kasan_record_aux_stack(void *addr) { struct slab *slab = kasan_addr_to_slab(addr); struct kmem_cache *cache; @@ -541,17 +545,7 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags); -} - -void kasan_record_aux_stack(void *addr) -{ - return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC); -} - -void kasan_record_aux_stack_noalloc(void *addr) -{ - return __kasan_record_aux_stack(addr, 0); + alloc_meta->aux_stack[0] = kasan_save_stack(0, 0); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index ccd66c7a4081..9a6927394b54 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -16,6 +16,7 @@ #include <linux/mm.h> #include <linux/static_key.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/types.h> #include <linux/vmalloc.h> @@ -263,8 +264,8 @@ void __init kasan_init_hw_tags(void) pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", kasan_mode_info(), - kasan_vmalloc_enabled() ? "on" : "off", - kasan_stack_collection_enabled() ? "on" : "off"); + str_on_off(kasan_vmalloc_enabled()), + str_on_off(kasan_stack_collection_enabled())); } #ifdef CONFIG_KASAN_VMALLOC diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b7e4b81421b3..129178be5e64 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -501,18 +501,18 @@ static inline bool kasan_byte_accessible(const void *addr) /** * kasan_poison - mark the memory range as inaccessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size, must be aligned to KASAN_GRANULE_SIZE - * @value - value that's written to metadata for the range - * @init - whether to initialize the memory range (only for hardware tag-based) + * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size, must be aligned to KASAN_GRANULE_SIZE + * @value: value that's written to metadata for the range + * @init: whether to initialize the memory range (only for hardware tag-based) */ void kasan_poison(const void *addr, size_t size, u8 value, bool init); /** * kasan_unpoison - mark the memory range as accessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size, can be unaligned - * @init - whether to initialize the memory range (only for hardware tag-based) + * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size, can be unaligned + * @init: whether to initialize the memory range (only for hardware tag-based) * * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before * marking the range. 
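The kasan logging change above (and the kfence hunks later in this listing) replaces open-coded ternaries with helpers from <linux/string_choices.h>. As a quick reference, here is a minimal userspace stand-in with the same behaviour the converted call sites expect; the main() usage is purely illustrative.

#include <stdbool.h>
#include <stdio.h>

/* Userspace equivalents of the two helpers used by these hunks. */
static inline const char *str_on_off(bool v)
{
	return v ? "on" : "off";
}

static inline const char *str_write_read(bool v)
{
	return v ? "write" : "read";
}

int main(void)
{
	/* e.g. the kasan init banner: "... vmalloc=%s, stacktrace=%s" */
	printf("vmalloc=%s, stacktrace=%s\n", str_on_off(true), str_on_off(false));
	/* e.g. the kfence report access type */
	printf("access type: %s\n", str_write_read(false));
	return 0;
}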
@@ -530,8 +530,8 @@ bool kasan_byte_accessible(const void *addr); /** * kasan_poison_last_granule - mark the last granule of the memory range as * inaccessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size + * @address: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size * * This function is only available for the generic mode, as it's the only mode * that has partially poisoned memory granules. diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 99d4ff0ed57a..59d673400085 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -47,8 +47,8 @@ static struct { * Some tests use these global variables to store return values from function * calls that could otherwise be eliminated by the compiler as dead code. */ -void *kasan_ptr_result; -int kasan_int_result; +static volatile void *kasan_ptr_result; +static volatile int kasan_int_result; /* Probe for console output: obtains test_status lines of interest. */ static void probe_console(void *ignore, const char *buf, size_t len) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 3fe77a360f1c..8357e1a33699 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -370,6 +370,36 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } +/* + * This function is invoked with report_lock (a raw_spinlock) held. A + * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping + * rt_spinlock. + * + * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a + * lockdep warning for this raw_spinlock -> spinlock dependency. This config + * option is enabled by default to ensure better test coverage to expose this + * kind of RT kernel problem. This lockdep splat, however, can be suppressed + * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the + * invalid PREEMPT_RT case has been taken care of. + */ +static inline struct vm_struct *kasan_find_vm_area(void *addr) +{ + static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP); + struct vm_struct *va; + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return NULL; + + /* + * Suppress lockdep warning and fetch vmalloc area of the + * offending address. + */ + lock_map_acquire_try(&vmalloc_map); + va = find_vm_area(addr); + lock_map_release(&vmalloc_map); + return va; +} + static void print_address_description(void *addr, u8 tag, struct kasan_report_info *info) { @@ -399,7 +429,7 @@ static void print_address_description(void *addr, u8 tag, } if (is_vmalloc_addr(addr)) { - struct vm_struct *va = find_vm_area(addr); + struct vm_struct *va = kasan_find_vm_area(addr); if (va) { pr_err("The buggy address belongs to the virtual mapping at\n" @@ -409,6 +439,8 @@ static void print_address_description(void *addr, u8 tag, pr_err("\n"); page = vmalloc_to_page(addr); + } else { + pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr); } } diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 220b5d4c6876..b9382b5b6a37 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/types.h> #include <linux/vmalloc.h> #include <linux/bug.h> @@ -45,7 +46,7 @@ void __init kasan_init_sw_tags(void) kasan_init_tags(); pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n", - kasan_stack_collection_enabled() ? 
"on" : "off"); + str_on_off(kasan_stack_collection_enabled())); } /* diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 67fc321db79b..102048821c22 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -21,6 +21,7 @@ #include <linux/log2.h> #include <linux/memblock.h> #include <linux/moduleparam.h> +#include <linux/nodemask.h> #include <linux/notifier.h> #include <linux/panic_notifier.h> #include <linux/random.h> @@ -1084,6 +1085,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) * properties (e.g. reside in DMAable memory). */ if ((flags & GFP_ZONEMASK) || + ((flags & __GFP_THISNODE) && num_online_nodes() > 1) || (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); return NULL; diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index f65fb182466d..00034e37bc9f 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/tracepoint.h> #include <trace/events/printk.h> @@ -88,7 +89,7 @@ struct expect_report { static const char *get_access_type(const struct expect_report *r) { - return r->is_write ? "write" : "read"; + return str_write_read(r->is_write); } /* Check observed report matches information in @r. */ diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 6370c5207d1a..10e6802a2edf 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -16,6 +16,7 @@ #include <linux/sprintf.h> #include <linux/stacktrace.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/sched/clock.h> #include <trace/events/error_report.h> @@ -184,7 +185,7 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show, static const char *get_access_type(bool is_write) { - return is_write ? 
"write" : "read"; + return str_write_read(is_write); } void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index bad1e130eda8..5f0be134141e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -948,17 +948,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } -static int find_pmd_or_thp_or_none(struct mm_struct *mm, - unsigned long address, - pmd_t **pmd) +static inline int check_pmd_state(pmd_t *pmd) { - pmd_t pmde; + pmd_t pmde = pmdp_get_lockless(pmd); - *pmd = mm_find_pmd(mm, address); - if (!*pmd) - return SCAN_PMD_NULL; - - pmde = pmdp_get_lockless(*pmd); if (pmd_none(pmde)) return SCAN_PMD_NONE; if (!pmd_present(pmde)) @@ -972,6 +965,17 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, return SCAN_SUCCEED; } +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) +{ + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_PMD_NULL; + + return check_pmd_state(*pmd); +} + static int check_pmd_still_valid(struct mm_struct *mm, unsigned long address, pmd_t *pmd) @@ -1721,7 +1725,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) pmd_t *pmd, pgt_pmd; spinlock_t *pml; spinlock_t *ptl; - bool skipped_uffd = false; + bool success = false; /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that @@ -1758,6 +1762,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) mmu_notifier_invalidate_range_start(&range); pml = pmd_lock(mm, pmd); + /* + * The lock of new_folio is still held, we will be blocked in + * the page fault path, which prevents the pte entries from + * being set again. So even though the old empty PTE page may be + * concurrently freed and a new PTE page is filled into the pmd + * entry, it is still empty and can be removed. + * + * So here we only need to recheck if the state of pmd entry + * still meets our requirements, rather than checking pmd_same() + * like elsewhere. + */ + if (check_pmd_state(pmd) != SCAN_SUCCEED) + goto drop_pml; ptl = pte_lockptr(mm, pmd); if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); @@ -1771,20 +1788,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * repeating the anon_vma check protects from one category, * and repeating the userfaultfd_wp() check from another. */ - if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) { - skipped_uffd = true; - } else { + if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) { pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); pmdp_get_lockless_sync(); + success = true; } if (ptl != pml) spin_unlock(ptl); +drop_pml: spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); - if (!skipped_uffd) { + if (success) { mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, addr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 820ba3b5cbfc..c6ed68604136 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1689,7 +1689,7 @@ static void kmemleak_scan(void) unsigned long phys = object->pointer; if (PHYS_PFN(phys) < min_low_pfn || - PHYS_PFN(phys + object->size) >= max_low_pfn) + PHYS_PFN(phys + object->size) > max_low_pfn) __paint_it(object, KMEMLEAK_BLACK); } @@ -1855,7 +1855,7 @@ static int kmemleak_scan_thread(void *arg) * Wait before the first scan to allow the system to fully initialize. 
*/ if (first_run) { - signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000); + signed long timeout = secs_to_jiffies(SECS_FIRST_SCAN); first_run = 0; while (timeout && !kthread_should_stop()) timeout = schedule_timeout_interruptible(timeout); @@ -2241,7 +2241,7 @@ void __init kmemleak_init(void) return; jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); - jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); + jiffies_scan_wait = secs_to_jiffies(SECS_SCAN_WAIT); object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 3ea50f09311f..3df45c25c1f6 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -357,6 +357,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size, size -= to_go; } } +EXPORT_SYMBOL_GPL(kmsan_handle_dma); void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, enum dma_data_direction dir) diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 9c58f081d84f..1bb505a08415 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -280,12 +280,8 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end) start = (void *)PAGE_ALIGN_DOWN((u64)start); size = PAGE_ALIGN((u64)end - (u64)start); - shadow = memblock_alloc(size, PAGE_SIZE); - origin = memblock_alloc(size, PAGE_SIZE); - - if (!shadow || !origin) - panic("%s: Failed to allocate metadata memory for early boot range of size %llu", - __func__, size); + shadow = memblock_alloc_or_panic(size, PAGE_SIZE); + origin = memblock_alloc_or_panic(size, PAGE_SIZE); for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { page = virt_to_page_or_null((char *)start + addr); @@ -3262,6 +3262,25 @@ static void wait_while_offlining(void) #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_PROC_FS +/* + * The process is mergeable only if any VMA is currently + * applicable to KSM. + * + * The mmap lock must be held in read mode. + */ +bool ksm_process_mergeable(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + VMA_ITERATOR(vmi, mm, 0); + for_each_vma(vmi, vma) + if (vma->vm_flags & VM_MERGEABLE) + return true; + + return false; +} + long ksm_process_profit(struct mm_struct *mm) { return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - diff --git a/mm/madvise.c b/mm/madvise.c index 0ceae57da7da..08b207f8e61e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -851,7 +851,12 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, static long madvise_dontneed_single_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - zap_page_range_single(vma, start, end - start, NULL); + struct zap_details details = { + .reclaim_pt = true, + .even_cows = true, + }; + + zap_page_range_single(vma, start, end - start, &details); return 0; } @@ -928,7 +933,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, */ end = vma->vm_end; } - VM_WARN_ON(start >= end); + /* + * If the memory region between start and end was + * originally backed by 4kB pages and then remapped to + * be backed by hugepages while mmap_lock was dropped, + * the adjustment for hugetlb vma above may have rounded + * end down to the start address. 
+ */ + if (start == end) + return 0; + VM_WARN_ON(start > end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) diff --git a/mm/memblock.c b/mm/memblock.c index 095c18b5c430..95af35fd1389 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1692,6 +1692,26 @@ void * __init memblock_alloc_try_nid( } /** + * __memblock_alloc_or_panic - Try to allocate memory and panic on failure + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @func: caller func name + * + * This function attempts to allocate memory using memblock_alloc, + * and in case of failure, it calls panic with the formatted message. + * This function should not be used directly, please use the macro memblock_alloc_or_panic. + */ +void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, + const char *func) +{ + void *addr = memblock_alloc(size, align); + + if (unlikely(!addr)) + panic("%s: Failed to allocate %pap bytes\n", func, &size); + return addr; +} + +/** * memblock_free_late - free pages directly to buddy allocator * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index a071fa43d479..2e9fa431bbf5 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -899,7 +899,7 @@ static void memcg_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. */ -static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct mem_cgroup_event *event = @@ -1040,13 +1040,13 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, } else if (!strcmp(name, "memory.oom_control")) { pr_warn_once("oom_control is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org" - " if you depend on this functionality. \n"); + " if you depend on this functionality.\n"); event->register_event = mem_cgroup_oom_register_event; event->unregister_event = mem_cgroup_oom_unregister_event; } else if (!strcmp(name, "memory.pressure_level")) { pr_warn_once("pressure_level is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org " - "if you depend on this functionality. 
\n"); + "if you depend on this functionality.\n"); event->register_event = vmpressure_register_event; event->unregister_event = vmpressure_unregister_event; } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { @@ -1134,8 +1134,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) failed = iter; mem_cgroup_iter_break(memcg, iter); break; - } else - iter->oom_lock = true; + } + iter->oom_lock = true; } if (failed) { @@ -1202,7 +1202,7 @@ struct oom_wait_info { }; static int memcg_oom_wake_function(wait_queue_entry_t *wait, - unsigned mode, int sync, void *arg) + unsigned int mode, int sync, void *arg) { struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; struct mem_cgroup *oom_wait_memcg; @@ -1644,7 +1644,7 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, unsigned long nr = 0; enum lru_list lru; - VM_BUG_ON((unsigned)nid >= nr_node_ids); + VM_BUG_ON((unsigned int)nid >= nr_node_ids); for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) @@ -1855,9 +1855,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, if (val > MAX_SWAPPINESS) return -EINVAL; - if (!mem_cgroup_is_root(memcg)) + if (!mem_cgroup_is_root(memcg)) { + pr_info_once("Per memcg swappiness does not exist in cgroup v2. " + "See memory.reclaim or memory.swap.max there\n "); WRITE_ONCE(memcg->swappiness, val); - else + } else WRITE_ONCE(vm_swappiness, val); return 0; @@ -1881,7 +1883,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, pr_warn_once("oom_control is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org if you " - "depend on this functionality. \n"); + "depend on this functionality.\n"); /* cannot set to root cgroup and only 0 and 1 are allowed */ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7b3503d12aaf..a037ec92881d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1161,6 +1161,7 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, { struct mem_cgroup *iter; int ret = 0; + int i = 0; BUG_ON(mem_cgroup_is_root(memcg)); @@ -1169,8 +1170,12 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct task_struct *task; css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); - while (!ret && (task = css_task_iter_next(&it))) + while (!ret && (task = css_task_iter_next(&it))) { + /* Avoid potential softlockup warning */ + if ((++i & 1023) == 0) + cond_resched(); ret = fn(task, arg); + } css_task_iter_end(&it); if (ret) { mem_cgroup_iter_break(memcg, iter); @@ -1448,6 +1453,18 @@ unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item) memcg_page_state_output_unit(item); } +#ifdef CONFIG_HUGETLB_PAGE +static bool memcg_accounts_hugetlb(void) +{ + return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; +} +#else /* CONFIG_HUGETLB_PAGE */ +static bool memcg_accounts_hugetlb(void) +{ + return false; +} +#endif /* CONFIG_HUGETLB_PAGE */ + static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) { int i; @@ -1469,7 +1486,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) #ifdef CONFIG_HUGETLB_PAGE if (unlikely(memory_stats[i].idx == NR_HUGETLB) && - !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) + !memcg_accounts_hugetlb()) continue; #endif size = memcg_page_state_output(memcg, memory_stats[i].idx); @@ -1904,9 +1921,18 @@ void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { 
struct memcg_stock_pcp *stock; + struct obj_cgroup *old; + unsigned long flags; stock = &per_cpu(memcg_stock, cpu); + + /* drain_obj_stock requires stock_lock */ + local_lock_irqsave(&memcg_stock.stock_lock, flags); + old = drain_obj_stock(stock); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + drain_stock(stock); + obj_cgroup_put(old); return 0; } @@ -2371,21 +2397,6 @@ done_restock: return 0; } -/** - * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call. - * @memcg: memcg previously charged. - * @nr_pages: number of pages previously charged. - */ -void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) -{ - if (mem_cgroup_is_root(memcg)) - return; - - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); -} - static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) { VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio); @@ -2399,18 +2410,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } -/** - * mem_cgroup_commit_charge - commit a previously successful try_charge(). - * @folio: folio to commit the charge to. - * @memcg: memcg previously charged. - */ -void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) -{ - css_get(&memcg->css); - commit_charge(folio, memcg); - memcg1_commit_charge(folio, memcg); -} - static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) @@ -4176,6 +4175,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, memcg_memory_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; + cond_resched(); } memcg_wb_domain_size_changed(memcg); @@ -4498,7 +4498,9 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, if (ret) goto out; - mem_cgroup_commit_charge(folio, memcg); + css_get(&memcg->css); + commit_charge(folio, memcg); + memcg1_commit_charge(folio, memcg); out: return ret; } @@ -4516,38 +4518,37 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) } /** - * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio - * @memcg: memcg to charge. - * @gfp: reclaim mode. - * @nr_pages: number of pages to charge. - * - * This function is called when allocating a huge page folio to determine if - * the memcg has the capacity for it. It does not commit the charge yet, - * as the hugetlb folio itself has not been obtained from the hugetlb pool. + * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio + * @folio: folio being charged + * @gfp: reclaim mode * - * Once we have obtained the hugetlb folio, we can call - * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the - * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect - * of try_charge(). + * This function is called when allocating a huge page folio, after the page has + * already been obtained and charged to the appropriate hugetlb cgroup + * controller (if it is enabled). * - * Returns 0 on success. Otherwise, an error code is returned. + * Returns ENOMEM if the memcg is already full. + * Returns 0 if either the charge was successful, or if we skip the charging. 
*/ -int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, - long nr_pages) +int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp) { + struct mem_cgroup *memcg = get_mem_cgroup_from_current(); + int ret = 0; + /* - * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation, - * but do not attempt to commit charge later (or cancel on error) either. + * Even memcg does not account for hugetlb, we still want to update + * system-level stats via lruvec_stat_mod_folio. Return 0, and skip + * charging the memcg. */ - if (mem_cgroup_disabled() || !memcg || - !cgroup_subsys_on_dfl(memory_cgrp_subsys) || - !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) - return -EOPNOTSUPP; + if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() || + !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + goto out; - if (try_charge(memcg, gfp, nr_pages)) - return -ENOMEM; + if (charge_memcg(folio, memcg, gfp)) + ret = -ENOMEM; - return 0; +out: + mem_cgroup_put(memcg); + return ret; } /** @@ -4609,7 +4610,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) * correspond 1:1 to page and swap slot lifetimes: we charge the * page to memory here, and uncharge swap when the slot is freed. */ - if (!mem_cgroup_disabled() && do_memsw_account()) { + if (do_memsw_account()) { /* * The swap entry might not get freed for a long time, * let's not wait for it. The page already received a @@ -4973,7 +4974,6 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { struct mem_cgroup *memcg, *swap_memcg; unsigned int nr_entries; - unsigned short oldid; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); @@ -5000,11 +5000,10 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) /* Get references for the tail pages, too */ if (nr_entries > 1) mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); - oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), - nr_entries); - VM_BUG_ON_FOLIO(oldid, folio); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); + swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry); + folio_unqueue_deferred_split(folio); folio->memcg_data = 0; @@ -5035,7 +5034,6 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) unsigned int nr_pages = folio_nr_pages(folio); struct page_counter *counter; struct mem_cgroup *memcg; - unsigned short oldid; if (do_memsw_account()) return 0; @@ -5064,10 +5062,10 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) /* Get references for the tail pages, too */ if (nr_pages > 1) mem_cgroup_id_get_many(memcg, nr_pages - 1); - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); - VM_BUG_ON_FOLIO(oldid, folio); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); + swap_cgroup_record(folio, mem_cgroup_id(memcg), entry); + return 0; } @@ -5081,7 +5079,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; - id = swap_cgroup_record(entry, 0, nr_pages); + id = swap_cgroup_clear(entry, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { diff --git a/mm/memfd.c b/mm/memfd.c index 35a370d75c9a..37f7be57c2f5 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) return error; } -unsigned int *memfd_file_seals_ptr(struct file *file) +static unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return 
&SHMEM_I(file_inode(file))->seals; @@ -327,15 +327,51 @@ static int check_sysctl_memfd_noexec(unsigned int *flags) return 0; } -SYSCALL_DEFINE2(memfd_create, - const char __user *, uname, - unsigned int, flags) +static inline bool is_write_sealed(unsigned int seals) { - unsigned int *file_seals; - struct file *file; - int fd, error; - char *name; - long len; + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +static int check_write_seal(unsigned long *vm_flags_ptr) +{ + unsigned long vm_flags = *vm_flags_ptr; + unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE); + + /* If a private mapping then writability is irrelevant. */ + if (!(mask & VM_SHARED)) + return 0; + + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. + */ + if (mask & VM_WRITE) + return -EPERM; + + /* + * This is a read-only mapping, disallow mprotect() from making a + * write-sealed mapping writable in future. + */ + *vm_flags_ptr &= ~VM_MAYWRITE; + + return 0; +} + +int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr) +{ + int err = 0; + unsigned int *seals_ptr = memfd_file_seals_ptr(file); + unsigned int seals = seals_ptr ? *seals_ptr : 0; + + if (is_write_sealed(seals)) + err = check_write_seal(vm_flags_ptr); + + return err; +} + +static int sanitize_flags(unsigned int *flags_ptr) +{ + unsigned int flags = *flags_ptr; if (!(flags & MFD_HUGETLB)) { if (flags & ~(unsigned int)MFD_ALL_FLAGS) @@ -351,50 +387,52 @@ SYSCALL_DEFINE2(memfd_create, if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) return -EINVAL; - error = check_sysctl_memfd_noexec(&flags); - if (error < 0) - return error; + return check_sysctl_memfd_noexec(flags_ptr); +} - /* length includes terminating zero */ - len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); - if (len <= 0) - return -EFAULT; - if (len > MFD_NAME_MAX_LEN + 1) - return -EINVAL; +static char *alloc_name(const char __user *uname) +{ + int error; + char *name; + long len; - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); strcpy(name, MFD_NAME_PREFIX); - if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + /* returned length does not include terminating zero */ + len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1); + if (len < 0) { error = -EFAULT; goto err_name; - } - - /* terminating-zero may have changed after strnlen_user() returned */ - if (name[len + MFD_NAME_PREFIX_LEN - 1]) { - error = -EFAULT; + } else if (len > MFD_NAME_MAX_LEN) { + error = -EINVAL; goto err_name; } - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ?
O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_name; - } + return name; + +err_name: + kfree(name); + return ERR_PTR(error); +} + +static struct file *alloc_file(const char *name, unsigned int flags) +{ + unsigned int *file_seals; + struct file *file; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); - } else + } else { file = shmem_file_setup(name, 0, VM_NORESERVE); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_fd; } + if (IS_ERR(file)) + return file; file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; @@ -414,6 +452,37 @@ SYSCALL_DEFINE2(memfd_create, *file_seals &= ~F_SEAL_SEAL; } + return file; +} + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + struct file *file; + int fd, error; + char *name; + + error = sanitize_flags(&flags); + if (error < 0) + return error; + + name = alloc_name(uname); + if (IS_ERR(name)) + return PTR_ERR(name); + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + file = alloc_file(name, flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + fd_install(fd, file); kfree(name); return fd; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a7b8ccd29b6f..327e02fdc029 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -124,7 +124,7 @@ const struct attribute_group memory_failure_attr_group = { .attrs = memory_failure_attr, }; -static struct ctl_table memory_failure_table[] = { +static const struct ctl_table memory_failure_table[] = { { .procname = "memory_failure_early_kill", .data = &sysctl_memory_failure_early_kill, @@ -1556,11 +1556,35 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) return ret; } -void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { - if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { - struct address_space *mapping; + enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; + struct address_space *mapping; + + if (folio_test_swapcache(folio)) { + pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); + ttu &= ~TTU_HWPOISON; + } + /* + * Propagate the dirty bit from PTEs to struct page first, because we + * need this to decide if we should kill or just drop the page. + * XXX: the dirty test could be racy: set_page_dirty() may not always + * be called inside page lock (it's recommended but not enforced). + */ + mapping = folio_mapping(folio); + if (!must_kill && !folio_test_dirty(folio) && mapping && + mapping_can_writeback(mapping)) { + if (folio_mkclean(folio)) { + folio_set_dirty(folio); + } else { + ttu &= ~TTU_HWPOISON; + pr_info("%#lx: corrupted page was clean: dropped without side effects\n", + pfn); + } + } + + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* * For hugetlb folios in shared mappings, try_to_unmap * could potentially call huge_pmd_unshare. Because of @@ -1572,7 +1596,7 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) if (!mapping) { pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n", folio_pfn(folio)); - return; + return -EBUSY; } try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); @@ -1580,6 +1604,8 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) } else { try_to_unmap(folio, ttu); } + + return folio_mapped(folio) ? 
-EBUSY : 0; } /* @@ -1589,8 +1615,6 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) static bool hwpoison_user_mappings(struct folio *folio, struct page *p, unsigned long pfn, int flags) { - enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; - struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; int forcekill; @@ -1613,29 +1637,6 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, if (!folio_mapped(folio)) return true; - if (folio_test_swapcache(folio)) { - pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); - ttu &= ~TTU_HWPOISON; - } - - /* - * Propagate the dirty bit from PTEs to struct page first, because we - * need this to decide if we should kill or just drop the page. - * XXX: the dirty test could be racy: set_page_dirty() may not always - * be called inside page lock (it's recommended but not enforced). - */ - mapping = folio_mapping(folio); - if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping && - mapping_can_writeback(mapping)) { - if (folio_mkclean(folio)) { - folio_set_dirty(folio); - } else { - ttu &= ~TTU_HWPOISON; - pr_info("%#lx: corrupted page was clean: dropped without side effects\n", - pfn); - } - } - /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, @@ -1643,9 +1644,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, */ collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); - unmap_poisoned_folio(folio, ttu); - - unmap_success = !folio_mapped(folio); + unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL); if (!unmap_success) pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n", pfn, folio_mapcount(folio)); diff --git a/mm/memory.c b/mm/memory.c index 398c031be9ba..4f6d9766a046 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1436,7 +1436,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ - if (!details) + if (!details || details->reclaim_pt) return true; /* Or, we zap COWed pages only if the caller wants to */ @@ -1466,34 +1466,42 @@ static inline bool zap_drop_markers(struct zap_details *details) /* * This function makes sure that we'll replace the none pte with an uffd-wp * swap special pte marker when necessary. Must be with the pgtable lock held. + * + * Returns true if uffd-wp PTEs were installed, false otherwise. */ -static inline void +static inline bool zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { + bool was_installed = false; + +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) - return; + return false; if (zap_drop_markers(details)) - return; + return false; for (;;) { /* the PFN in the PTE is irrelevant.
*/ - pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval)) + was_installed = true; if (--nr == 0) break; pte++; addr += PAGE_SIZE; } +#endif + return was_installed; } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, - bool *force_flush, bool *force_break) + bool *force_flush, bool *force_break, bool *any_skipped) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; @@ -1519,8 +1527,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, - ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, + nr, details, ptent); if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); @@ -1544,7 +1552,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, - bool *force_break) + bool *force_break, bool *any_skipped) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; @@ -1559,15 +1567,17 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) - zap_install_uffd_wp_if_needed(vma, addr, pte, 1, - details, ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, + pte, 1, details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) + if (unlikely(!should_zap_folio(details, folio))) { + *any_skipped = true; return 1; + } /* * Make sure that the common "small folio" case is as fast as possible @@ -1579,14 +1589,121 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, - force_break); + force_break, any_skipped); return nr; } zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, - details, rss, force_flush, force_break); + details, rss, force_flush, force_break, any_skipped); return 1; } +static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned int max_nr, unsigned long addr, + struct zap_details *details, int *rss, bool *any_skipped) +{ + swp_entry_t entry; + int nr = 1; + + *any_skipped = true; + entry = pte_to_swp_entry(ptent); + if (is_device_private_entry(entry) || + is_device_exclusive_entry(entry)) { + struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = page_folio(page); + + if (unlikely(!should_zap_folio(details, folio))) + return 1; + /* + * Both device private/exclusive mappings should only + * work with anonymous page so far, so we don't need to + * consider uffd-wp bit when zap. For more information, + * see zap_install_uffd_wp_if_needed(). 
+ */ + WARN_ON_ONCE(!vma_is_anonymous(vma)); + rss[mm_counter(folio)]--; + if (is_device_private_entry(entry)) + folio_remove_rmap_pte(folio, page, vma); + folio_put(folio); + } else if (!non_swap_entry(entry)) { + /* Genuine swap entries, hence a private anon pages */ + if (!should_zap_cows(details)) + return 1; + + nr = swap_pte_batch(pte, max_nr, ptent); + rss[MM_SWAPENTS] -= nr; + free_swap_and_cache_nr(entry, nr); + } else if (is_migration_entry(entry)) { + struct folio *folio = pfn_swap_entry_folio(entry); + + if (!should_zap_folio(details, folio)) + return 1; + rss[mm_counter(folio)]--; + } else if (pte_marker_entry_uffd_wp(entry)) { + /* + * For anon: always drop the marker; for file: only + * drop the marker if explicitly requested. + */ + if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) + return 1; + } else if (is_guard_swp_entry(entry)) { + /* + * Ordinary zapping should not remove guard PTE + * markers. Only do so if we should remove PTE markers + * in general. + */ + if (!zap_drop_markers(details)) + return 1; + } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { + if (!should_zap_cows(details)) + return 1; + } else { + /* We should have covered all the swap entry types */ + pr_alert("unrecognized swap entry 0x%lx\n", entry.val); + WARN_ON_ONCE(1); + } + clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); + + return nr; +} + +static inline int do_zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, + unsigned long addr, unsigned long end, + struct zap_details *details, int *rss, + bool *force_flush, bool *force_break, + bool *any_skipped) +{ + pte_t ptent = ptep_get(pte); + int max_nr = (end - addr) / PAGE_SIZE; + int nr = 0; + + /* Skip all consecutive none ptes */ + if (pte_none(ptent)) { + for (nr = 1; nr < max_nr; nr++) { + ptent = ptep_get(pte + nr); + if (!pte_none(ptent)) + break; + } + max_nr -= nr; + if (!max_nr) + return nr; + pte += nr; + addr += nr * PAGE_SIZE; + } + + if (pte_present(ptent)) + nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, force_flush, force_break, + any_skipped); + else + nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, any_skipped); + + return nr; +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -1598,9 +1715,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; - swp_entry_t entry; + pmd_t pmdval; + unsigned long start = addr; + bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); + bool direct_reclaim = true; int nr; +retry: tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1610,90 +1731,35 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = ptep_get(pte); - struct folio *folio; - struct page *page; - int max_nr; + bool any_skipped = false; - nr = 1; - if (pte_none(ptent)) - continue; - - if (need_resched()) + if (need_resched()) { + direct_reclaim = false; break; - - if (pte_present(ptent)) { - max_nr = (end - addr) / PAGE_SIZE; - nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, - addr, details, rss, &force_flush, - &force_break); - if (unlikely(force_break)) { - addr += nr * PAGE_SIZE; - break; - } - 
continue; } - entry = pte_to_swp_entry(ptent); - if (is_device_private_entry(entry) || - is_device_exclusive_entry(entry)) { - page = pfn_swap_entry_to_page(entry); - folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) - continue; - /* - * Both device private/exclusive mappings should only - * work with anonymous page so far, so we don't need to - * consider uffd-wp bit when zap. For more information, - * see zap_install_uffd_wp_if_needed(). - */ - WARN_ON_ONCE(!vma_is_anonymous(vma)); - rss[mm_counter(folio)]--; - if (is_device_private_entry(entry)) - folio_remove_rmap_pte(folio, page, vma); - folio_put(folio); - } else if (!non_swap_entry(entry)) { - max_nr = (end - addr) / PAGE_SIZE; - nr = swap_pte_batch(pte, max_nr, ptent); - /* Genuine swap entries, hence a private anon pages */ - if (!should_zap_cows(details)) - continue; - rss[MM_SWAPENTS] -= nr; - free_swap_and_cache_nr(entry, nr); - } else if (is_migration_entry(entry)) { - folio = pfn_swap_entry_folio(entry); - if (!should_zap_folio(details, folio)) - continue; - rss[mm_counter(folio)]--; - } else if (pte_marker_entry_uffd_wp(entry)) { - /* - * For anon: always drop the marker; for file: only - * drop the marker if explicitly requested. - */ - if (!vma_is_anonymous(vma) && - !zap_drop_markers(details)) - continue; - } else if (is_guard_swp_entry(entry)) { - /* - * Ordinary zapping should not remove guard PTE - * markers. Only do so if we should remove PTE markers - * in general. - */ - if (!zap_drop_markers(details)) - continue; - } else if (is_hwpoison_entry(entry) || - is_poisoned_swp_entry(entry)) { - if (!should_zap_cows(details)) - continue; - } else { - /* We should have covered all the swap entry types */ - pr_alert("unrecognized swap entry 0x%lx\n", entry.val); - WARN_ON_ONCE(1); + nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, + &force_flush, &force_break, &any_skipped); + if (any_skipped) + can_reclaim_pt = false; + if (unlikely(force_break)) { + addr += nr * PAGE_SIZE; + direct_reclaim = false; + break; } - clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); + /* + * Fast path: try to hold the pmd lock and unmap the PTE page. + * + * If the pte lock was released midway (retry case), or if the attempt + * to hold the pmd lock failed, then we need to recheck all pte entries + * to ensure they are still none, thereby preventing the pte entries + * from being repopulated by another thread. 
+ */ + if (can_reclaim_pt && direct_reclaim && addr == end) + direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); @@ -1713,6 +1779,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (force_flush) tlb_flush_mmu(tlb); + if (addr != end) { + cond_resched(); + force_flush = false; + force_break = false; + goto retry; + } + + if (can_reclaim_pt) { + if (direct_reclaim) + free_pte(mm, start, tlb, pmdval); + else + try_to_free_pte(mm, pmd, start, tlb); + } + return addr; } @@ -1935,7 +2015,6 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, struct mmu_notifier_range range; struct mmu_gather tlb; - lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); @@ -2971,8 +3050,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, next = pgd_addr_end(addr, end); if (pgd_none(*pgd) && !create) continue; - if (WARN_ON_ONCE(pgd_leaf(*pgd))) - return -EINVAL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) { + err = -EINVAL; + break; + } if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { if (!create) continue; @@ -3013,7 +3094,6 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, { return __apply_to_page_range(mm, addr, size, fn, data, false); } -EXPORT_SYMBOL_GPL(apply_to_existing_page_range); /* * handle_pte_fault chooses page fault handler according to an entry which was @@ -4189,8 +4269,10 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) return folio; + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); folio_put(folio); } + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); order = next_order(&orders, order); } @@ -5102,7 +5184,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf) bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED); int type, nr_pages; - unsigned long addr = vmf->address; + unsigned long addr; + bool needs_fallback = false; + +fallback: + addr = vmf->address; /* Did we COW the page? */ if (is_cow) @@ -5141,7 +5227,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf) * approach also applies to non-anonymous-shmem faults to avoid * inflating the RSS of the process. 
*/ - if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) { + if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) || + unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); @@ -5177,9 +5264,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; goto unlock; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { - update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); - ret = VM_FAULT_NOPAGE; - goto unlock; + needs_fallback = true; + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto fallback; } folio_ref_add(folio, nr_pages - 1); @@ -5625,7 +5712,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) ignore_writable = true; /* Migrate to the requested node */ - if (!migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; task_numa_fault(last_cpupid, nid, nr_pages, flags); @@ -6069,7 +6156,8 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, } /* - * By the time we get here, we already hold the mm semaphore + * By the time we get here, we already hold either the VMA lock or the + * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). @@ -6185,7 +6273,7 @@ static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_r /* * Helper for page fault handling. * - * This is kind of equivalend to "mmap_read_lock()" followed + * This is kind of equivalent to "mmap_read_lock()" followed * by "find_extend_vma()", except it's a lot more careful about * the locking (and will drop the lock on failure). * @@ -6746,10 +6834,8 @@ void __might_fault(const char *file, int line) if (pagefault_disabled()) return; __might_sleep(file, line); -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(¤t->mm->mmap_lock); -#endif } EXPORT_SYMBOL(__might_fault); #endif @@ -6961,7 +7047,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc) void ptlock_free(struct ptdesc *ptdesc) { - kmem_cache_free(page_ptl_cachep, ptdesc->ptl); + if (ptdesc->ptl) + kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c43b4e7fb298..16cf9e17077e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -219,11 +219,30 @@ void put_online_mems(void) bool movable_node_enabled = false; -#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -int mhp_default_online_type = MMOP_OFFLINE; -#else -int mhp_default_online_type = MMOP_ONLINE; -#endif +static int mhp_default_online_type = -1; +int mhp_get_default_online_type(void) +{ + if (mhp_default_online_type >= 0) + return mhp_default_online_type; + + if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE)) + mhp_default_online_type = MMOP_OFFLINE; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO)) + mhp_default_online_type = MMOP_ONLINE; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL)) + mhp_default_online_type = MMOP_ONLINE_KERNEL; + else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE)) + mhp_default_online_type = MMOP_ONLINE_MOVABLE; + else + mhp_default_online_type = MMOP_OFFLINE; + + return mhp_default_online_type; +} + +void mhp_set_default_online_type(int online_type) +{ + mhp_default_online_type = online_type; +} static int __init setup_memhp_default_state(char *str) { @@ -650,6 +669,7 @@ static void 
online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * this and the first chunk to online will be pageblock_nr_pages. */ for (pfn = start_pfn; pfn < end_pfn;) { + struct page *page = pfn_to_page(pfn); int order; /* @@ -664,7 +684,14 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) else order = MAX_PAGE_ORDER; - (*online_page_callback)(pfn_to_page(pfn), order); + /* + * Exposing the page to the buddy by freeing can cause + * issues with debug_pagealloc enabled: some archs don't + * like double-unmappings. So treat them like any pages that + * were allocated from the buddy. + */ + debug_pagealloc_map_pages(page, 1 << order); + (*online_page_callback)(page, order); pfn += (1UL << order); } @@ -1320,7 +1347,7 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { - mem->online_type = mhp_default_online_type; + mem->online_type = mhp_get_default_online_type(); return device_online(&mem->dev); } @@ -1567,7 +1594,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) merge_system_ram_resource(res); /* online pages if requested */ - if (mhp_default_online_type != MMOP_OFFLINE) + if (mhp_get_default_online_type() != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; @@ -1795,26 +1822,24 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (folio_test_large(folio)) pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; - /* - * HWPoison pages have elevated reference counts so the migration would - * fail on them. It also doesn't make any sense to migrate them in the - * first place. Still try to unmap such a page in case it is still mapped - * (keep the unmap as the catch all safety net). - */ + if (!folio_try_get(folio)) + continue; + + if (unlikely(page_folio(page) != folio)) + goto put_folio; + if (folio_test_hwpoison(folio) || (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) { if (WARN_ON(folio_test_lru(folio))) folio_isolate_lru(folio); - if (folio_mapped(folio)) - unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK); - continue; - } - - if (!folio_try_get(folio)) - continue; + if (folio_mapped(folio)) { + folio_lock(folio); + unmap_poisoned_folio(folio, pfn, false); + folio_unlock(folio); + } - if (unlikely(page_folio(page) != folio)) goto put_folio; + } if (!isolate_folio_to_list(folio, &source)) { if (__ratelimit(&migrate_rs)) { @@ -1830,7 +1855,7 @@ put_folio: nodemask_t nmask = node_states[N_MEMORY]; struct migration_target_control mtc = { .nmask = &nmask, - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, .reason = MR_MEMORY_HOTPLUG, }; int ret; @@ -1992,8 +2017,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - MEMORY_OFFLINE | REPORT_FAILURE, - GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL); + MEMORY_OFFLINE | REPORT_FAILURE); if (ret) { reason = "failure to isolate range"; goto failed_removal_pcplists_disabled; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 162407fbf2bc..a9eea051b0d6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -196,6 +196,37 @@ int numa_nearest_node(int node, unsigned int state) } EXPORT_SYMBOL_GPL(numa_nearest_node); +/** + * nearest_node_nodemask - Find the node in @mask at the nearest distance + * from @node. 
+ * + * @node: a valid node ID to start the search from. + * @mask: a pointer to a nodemask representing the allowed nodes. + * + * This function iterates over all nodes in @mask and calculates the + * distance from the starting @node, then it returns the node ID that is + * the closest to @node, or MAX_NUMNODES if no node is found. + * + * Note that @node must be a valid node ID usable with node_distance(), + * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes + * or unexpected behavior. + */ +int nearest_node_nodemask(int node, nodemask_t *mask) +{ + int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; + + for_each_node_mask(n, *mask) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + + return min_node; +} +EXPORT_SYMBOL_GPL(nearest_node_nodemask); + struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; @@ -647,7 +678,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, */ if ((flags & MPOL_MF_MOVE_ALL) || (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) - if (!isolate_hugetlb(folio, qp->pagelist)) + if (!folio_isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); @@ -2205,9 +2236,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask); + page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask); if (!page) - page = __alloc_pages_noprof(gfp, order, nid, NULL); + page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL); return page; } @@ -2222,7 +2253,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, +static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; @@ -2253,8 +2284,9 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ - page = __alloc_pages_node_noprof(nid, - gfp | __GFP_THISNODE | __GFP_NORETRY, order); + page = __alloc_frozen_pages_noprof( + gfp | __GFP_THISNODE | __GFP_NORETRY, order, + nid, NULL); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; /* @@ -2266,7 +2298,7 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, } } - page = __alloc_pages_noprof(gfp, order, nid, nodemask); + page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE || pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) { @@ -2285,8 +2317,13 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { - return page_rmappable_folio(alloc_pages_mpol_noprof(gfp | __GFP_COMP, - order, pol, ilx, nid)); + struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, + ilx, nid); + if (!page) + return NULL; + + set_page_refcounted(page); + return page_rmappable_folio(page); } /** @@ -2300,7 +2337,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, * NUMA policy. 
The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and - * excepting where direct use of alloc_pages_mpol() is more appropriate. + * excepting where direct use of folio_alloc_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. */ @@ -2321,6 +2358,21 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct } EXPORT_SYMBOL(vma_alloc_folio_noprof); +struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order) +{ + struct mempolicy *pol = &default_policy; + + /* + * No reference counting needed for current->mempolicy + * nor system default_policy + */ + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); + + return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, + numa_node_id()); +} + /** * alloc_pages - Allocate pages. * @gfp: GFP flags. @@ -2337,17 +2389,11 @@ EXPORT_SYMBOL(vma_alloc_folio_noprof); */ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { - struct mempolicy *pol = &default_policy; - - /* - * No reference counting needed for current->mempolicy - * nor system default_policy - */ - if (!in_interrupt() && !(gfp & __GFP_THISNODE)) - pol = get_task_policy(current); + struct page *page = alloc_frozen_pages_noprof(gfp, order); - return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX, - numa_node_id()); + if (page) + set_page_refcounted(page); + return page; } EXPORT_SYMBOL(alloc_pages_noprof); @@ -2357,7 +2403,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) } EXPORT_SYMBOL(folio_alloc_noprof); -static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, +static unsigned long alloc_pages_bulk_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2376,13 +2422,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, if (delta) { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, - nr_pages_per_node + 1, NULL, + nr_pages_per_node + 1, page_array); delta--; } else { nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, - nr_pages_per_node, NULL, page_array); + nr_pages_per_node, page_array); } page_array += nr_allocated; @@ -2392,7 +2438,7 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, return total_allocated; } -static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, +static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2431,7 +2477,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, if (weight && node_isset(node, nodes)) { node_pages = min(rem_pages, weight); nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, - NULL, page_array); + page_array); page_array += nr_allocated; total_allocated += nr_allocated; /* if that's all the pages, no need to interleave */ @@ -2494,7 +2540,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, if (!node_pages) break; nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages, - NULL, page_array); + page_array); page_array += nr_allocated; total_allocated += nr_allocated; if (total_allocated == nr_pages) @@ -2507,7 +2553,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp, return total_allocated; } -static unsigned long 
alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, +static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { @@ -2518,11 +2564,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, - nr_pages, NULL, page_array); + nr_pages, page_array); if (nr_allocated < nr_pages) nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, - nr_pages - nr_allocated, NULL, + nr_pages - nr_allocated, page_array + nr_allocated); return nr_allocated; } @@ -2533,7 +2579,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, * It can accelerate memory allocation especially interleaving * allocate memory. */ -unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; @@ -2544,21 +2590,21 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) - return alloc_pages_bulk_array_interleave(gfp, pol, + return alloc_pages_bulk_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_WEIGHTED_INTERLEAVE) - return alloc_pages_bulk_array_weighted_interleave( + return alloc_pages_bulk_weighted_interleave( gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) - return alloc_pages_bulk_array_preferred_many(gfp, + return alloc_pages_bulk_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); return alloc_pages_bulk_noprof(gfp, nid, nodemask, - nr_pages, NULL, page_array); + nr_pages, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) diff --git a/mm/migrate.c b/mm/migrate.c index cc68583c86f9..97f0edf0c032 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -68,10 +68,6 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) if (!folio) goto out; - if (unlikely(folio_test_slab(folio))) - goto out_putfolio; - /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ - smp_rmb(); /* * Check movable flag before taking the page lock because * we use non-atomic bitops on newly allocated page flags so @@ -79,10 +75,6 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) */ if (unlikely(!__folio_test_movable(folio))) goto out_putfolio; - /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */ - smp_rmb(); - if (unlikely(folio_test_slab(folio))) - goto out_putfolio; /* * As movable pages are not isolated from LRU lists, concurrent @@ -136,7 +128,7 @@ static void putback_movable_folio(struct folio *folio) * * This function shall be used whenever the isolated pageset has been * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() - * and isolate_hugetlb(). + * and folio_isolate_hugetlb(). 
*/ void putback_movable_pages(struct list_head *l) { @@ -145,7 +137,7 @@ void putback_movable_pages(struct list_head *l) list_for_each_entry_safe(folio, folio2, l, lru) { if (unlikely(folio_test_hugetlb(folio))) { - folio_putback_active_hugetlb(folio); + folio_putback_hugetlb(folio); continue; } list_del(&folio->lru); @@ -177,7 +169,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list) bool isolated, lru; if (folio_test_hugetlb(folio)) - return isolate_hugetlb(folio, list); + return folio_isolate_hugetlb(folio, list); lru = !__folio_test_movable(folio); if (lru) @@ -526,15 +518,13 @@ static int __folio_migrate_mapping(struct address_space *mapping, if (folio_test_anon(folio) && folio_test_large(folio)) mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); folio_ref_add(newfolio, nr); /* add cache reference */ - if (folio_test_swapbacked(folio)) { + if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); - if (folio_test_swapcache(folio)) { - folio_set_swapcache(newfolio); - newfolio->private = folio_get_private(folio); - } + if (folio_test_swapcache(folio)) { + folio_set_swapcache(newfolio); + newfolio->private = folio_get_private(folio); entries = nr; } else { - VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); entries = 1; } @@ -1459,7 +1449,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ - folio_putback_active_hugetlb(src); + folio_putback_hugetlb(src); return MIGRATEPAGE_SUCCESS; } @@ -1542,19 +1532,19 @@ out_unlock: folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) - folio_putback_active_hugetlb(src); + folio_putback_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); /* - * If migration was not successful and there's a freeing callback, use - * it. Otherwise, put_page() will drop the reference grabbed during - * isolation. + * If migration was not successful and there's a freeing callback, + * return the folio to that special allocator. Otherwise, simply drop + * our additional reference. 
*/ if (put_new_folio) put_new_folio(dst, private); else - folio_putback_active_hugetlb(dst); + folio_put(dst); return rc; } @@ -1695,6 +1685,81 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, return nr_failed; } +static void migrate_folios_move(struct list_head *src_folios, + struct list_head *dst_folios, + free_folio_t put_new_folio, unsigned long private, + enum migrate_mode mode, int reason, + struct list_head *ret_folios, + struct migrate_pages_stats *stats, + int *retry, int *thp_retry, int *nr_failed, + int *nr_retry_pages) +{ + struct folio *folio, *folio2, *dst, *dst2; + bool is_thp; + int nr_pages; + int rc; + + dst = list_first_entry(dst_folios, struct folio, lru); + dst2 = list_next_entry(dst, lru); + list_for_each_entry_safe(folio, folio2, src_folios, lru) { + is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); + nr_pages = folio_nr_pages(folio); + + cond_resched(); + + rc = migrate_folio_move(put_new_folio, private, + folio, dst, mode, + reason, ret_folios); + /* + * The rules are: + * Success: folio will be freed + * -EAGAIN: stay on the unmap_folios list + * Other errno: put on ret_folios list + */ + switch (rc) { + case -EAGAIN: + *retry += 1; + *thp_retry += is_thp; + *nr_retry_pages += nr_pages; + break; + case MIGRATEPAGE_SUCCESS: + stats->nr_succeeded += nr_pages; + stats->nr_thp_succeeded += is_thp; + break; + default: + *nr_failed += 1; + stats->nr_thp_failed += is_thp; + stats->nr_failed_pages += nr_pages; + break; + } + dst = dst2; + dst2 = list_next_entry(dst, lru); + } +} + +static void migrate_folios_undo(struct list_head *src_folios, + struct list_head *dst_folios, + free_folio_t put_new_folio, unsigned long private, + struct list_head *ret_folios) +{ + struct folio *folio, *folio2, *dst, *dst2; + + dst = list_first_entry(dst_folios, struct folio, lru); + dst2 = list_next_entry(dst, lru); + list_for_each_entry_safe(folio, folio2, src_folios, lru) { + int old_page_state = 0; + struct anon_vma *anon_vma = NULL; + + __migrate_folio_extract(dst, &old_page_state, &anon_vma); + migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, + anon_vma, true, ret_folios); + list_del(&dst->lru); + migrate_folio_undo_dst(dst, true, put_new_folio, private); + dst = dst2; + dst2 = list_next_entry(dst, lru); + } +} + /* * migrate_pages_batch() first unmaps folios in the from list as many as * possible, then move the unmapped folios. 
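The two helpers above, migrate_folios_move() and migrate_folios_undo(), walk the unmap_folios and dst_folios lists in lockstep: the n-th destination folio corresponds to the n-th source folio, and the next destination is saved in dst2 before the current pair is handled, because the undo path unlinks dst from its list. A minimal userspace sketch of that traversal pattern, using hypothetical node types in place of kernel folios and list_heads:

#include <stdio.h>

struct node { int id; struct node *next; };

/* Walk two parallel lists in lockstep, caching the next dst entry
 * before the current pair is processed (dst may be unlinked there). */
static void walk_pairs(struct node *src, struct node *dst)
{
	struct node *dst2;

	while (src) {
		dst2 = dst ? dst->next : NULL;
		printf("src %d -> dst %d\n", src->id, dst ? dst->id : -1);
		src = src->next;
		dst = dst2;
	}
}

int main(void)
{
	struct node s3 = { 3, NULL }, s2 = { 2, &s3 }, s1 = { 1, &s2 };
	struct node d3 = { 30, NULL }, d2 = { 20, &d3 }, d1 = { 10, &d2 };

	walk_pairs(&s1, &d1);
	return 0;
}

Factoring the move and undo phases out this way keeps migrate_pages_batch() focused on its retry loop while the paired-list bookkeeping lives in one place.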
@@ -1717,7 +1782,7 @@ static int migrate_pages_batch(struct list_head *from, int pass = 0; bool is_thp = false; bool is_large = false; - struct folio *folio, *folio2, *dst = NULL, *dst2; + struct folio *folio, *folio2, *dst = NULL; int rc, rc_saved = 0, nr_pages; LIST_HEAD(unmap_folios); LIST_HEAD(dst_folios); @@ -1888,42 +1953,11 @@ move: thp_retry = 0; nr_retry_pages = 0; - dst = list_first_entry(&dst_folios, struct folio, lru); - dst2 = list_next_entry(dst, lru); - list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { - is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); - nr_pages = folio_nr_pages(folio); - - cond_resched(); - - rc = migrate_folio_move(put_new_folio, private, - folio, dst, mode, - reason, ret_folios); - /* - * The rules are: - * Success: folio will be freed - * -EAGAIN: stay on the unmap_folios list - * Other errno: put on ret_folios list - */ - switch(rc) { - case -EAGAIN: - retry++; - thp_retry += is_thp; - nr_retry_pages += nr_pages; - break; - case MIGRATEPAGE_SUCCESS: - stats->nr_succeeded += nr_pages; - stats->nr_thp_succeeded += is_thp; - break; - default: - nr_failed++; - stats->nr_thp_failed += is_thp; - stats->nr_failed_pages += nr_pages; - break; - } - dst = dst2; - dst2 = list_next_entry(dst, lru); - } + /* Move the unmapped folios */ + migrate_folios_move(&unmap_folios, &dst_folios, + put_new_folio, private, mode, reason, + ret_folios, stats, &retry, &thp_retry, + &nr_failed, &nr_retry_pages); } nr_failed += retry; stats->nr_thp_failed += thp_retry; @@ -1932,20 +1966,8 @@ move: rc = rc_saved ? : nr_failed; out: /* Cleanup remaining folios */ - dst = list_first_entry(&dst_folios, struct folio, lru); - dst2 = list_next_entry(dst, lru); - list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { - int old_page_state = 0; - struct anon_vma *anon_vma = NULL; - - __migrate_folio_extract(dst, &old_page_state, &anon_vma); - migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, - anon_vma, true, ret_folios); - list_del(&dst->lru); - migrate_folio_undo_dst(dst, true, put_new_folio, private); - dst = dst2; - dst2 = list_next_entry(dst, lru); - } + migrate_folios_undo(&unmap_folios, &dst_folios, + put_new_folio, private, ret_folios); return rc; } @@ -2208,7 +2230,7 @@ static int __add_folio_for_migration(struct folio *folio, int node, return -EACCES; if (folio_test_hugetlb(folio)) { - if (isolate_hugetlb(folio, pagelist)) + if (folio_isolate_hugetlb(folio, pagelist)) return 1; } else if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, pagelist); @@ -2683,8 +2705,7 @@ int migrate_misplaced_folio_prepare(struct folio *folio, * elevated reference count on the folio. This function will un-isolate the * folio, dereferencing the folio before returning. 
*/ -int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, - int node) +int migrate_misplaced_folio(struct folio *folio, int node) { pg_data_t *pgdat = NODE_DATA(node); int nr_remaining; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 9cf26592ac93..5bd888223cc8 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -840,20 +840,15 @@ void migrate_device_finalize(unsigned long *src_pfns, dst = src; } + if (!folio_is_zone_device(dst)) + folio_add_lru(dst); remove_migration_ptes(src, dst, 0); folio_unlock(src); - - if (folio_is_zone_device(src)) - folio_put(src); - else - folio_putback_lru(src); + folio_put(src); if (dst != src) { folio_unlock(dst); - if (folio_is_zone_device(dst)) - folio_put(dst); - else - folio_putback_lru(dst); + folio_put(dst); } } } diff --git a/mm/mm_init.c b/mm/mm_init.c index 24b68b425afb..2630cc30147e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1585,13 +1585,17 @@ void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, { void *ptr; + /* + * Kmemleak will explicitly scan mem_map by traversing all valid + * `struct *page`,so memblock does not need to be added to the scan list. + */ if (exact_nid) ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, - MEMBLOCK_ALLOC_ACCESSIBLE, + MEMBLOCK_ALLOC_NOLEAKTRACE, nid); else ptr = memblock_alloc_try_nid_raw(size, align, min_addr, - MEMBLOCK_ALLOC_ACCESSIBLE, + MEMBLOCK_ALLOC_NOLEAKTRACE, nid); if (ptr && size > 0) diff --git a/mm/mmap.c b/mm/mmap.c index aec208f90337..cda01071c7b1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -111,8 +111,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len) return mlock_future_ok(current->mm, current->mm->def_flags, len) ? 0 : -EAGAIN; } -static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, - unsigned long addr, unsigned long request, unsigned long flags); + SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; @@ -278,8 +277,62 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode, return true; } -/* +/** + * do_mmap() - Perform a userland memory mapping into the current process + * address space of length @len with protection bits @prot, mmap flags @flags + * (from which VMA flags will be inferred), and any additional VMA flags to + * apply @vm_flags. If this is a file-backed mapping then the file is specified + * in @file and page offset into the file via @pgoff. + * + * This function does not perform security checks on the file and assumes, if + * @uf is non-NULL, the caller has provided a list head to track unmap events + * for userfaultfd @uf. + * + * It also simply indicates whether memory population is required by setting + * @populate, which must be non-NULL, expecting the caller to actually perform + * this task itself if appropriate. + * + * This function will invoke architecture-specific (and if provided and + * relevant, file system-specific) logic to determine the most appropriate + * unmapped area in which to place the mapping if not MAP_FIXED. + * + * Callers which require userland mmap() behaviour should invoke vm_mmap(), + * which is also exported for module use. + * + * Those which require this behaviour less security checks, userfaultfd and + * populate behaviour, and who handle the mmap write lock themselves, should + * call this function. 
+ * + * Note that the returned address may reside within a merged VMA if an + * appropriate merge were to take place, so it doesn't necessarily specify the + * start of a VMA, rather only the start of a valid mapped range of length + * @len bytes, rounded down to the nearest page size. + * * The caller must write-lock current->mm->mmap_lock. + * + * @file: An optional struct file pointer describing the file which is to be + * mapped, if a file-backed mapping. + * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the + * address at which to perform this mapping. See mmap (2) for details. Must be + * page-aligned. + * @len: The length of the mapping. Will be page-aligned and must be at least 1 + * page in size. + * @prot: Protection bits describing access required to the mapping. See mmap + * (2) for details. + * @flags: Flags specifying how the mapping should be performed, see mmap (2) + * for details. + * @vm_flags: VMA flags which should be set by default, or 0 otherwise. + * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise. + * @populate: A pointer to a value which will be set to 0 if no population of + * the range is required, or the number of bytes to populate if it is. Must be + * non-NULL. See mmap (2) for details as to under what circumstances population + * of the range occurs. + * @uf: An optional pointer to a list head to track userfaultfd unmap events + * should unmapping events arise. If provided, it is up to the caller to manage + * this. + * + * Returns: Either an error, or the address at which the requested mapping has + * been performed. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -292,6 +345,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, *populate = 0; + mmap_assert_write_locked(mm); + if (!len) return -EINVAL; @@ -369,8 +424,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); - unsigned int seals = memfd_file_seals(file); unsigned long flags_mask; + int err; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; @@ -410,8 +465,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); - else if (is_readonly_sealed(seals, vm_flags)) - vm_flags &= ~VM_MAYWRITE; fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) @@ -431,6 +484,14 @@ unsigned long do_mmap(struct file *file, unsigned long addr, default: return -EINVAL; } + + /* + * Check to see if we are violating any seals and update VMA + * flags if necessary to avoid future seal violations. + */ + err = memfd_check_seals_mmap(file, &vm_flags); + if (err) + return (unsigned long)err; } else { switch (flags & MAP_TYPE) { case MAP_SHARED: @@ -581,115 +642,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ -/** - * unmapped_area() - Find an area between the low_limit and the high_limit with - * the correct alignment and offset, all from @info. Note: current->mm is used - * for the search. - * - * @info: The unmapped area information including the range [low_limit - - * high_limit), the alignment offset and mask. - * - * Return: A memory address or -ENOMEM. 
- */ -static unsigned long unmapped_area(struct vm_unmapped_area_info *info) -{ - unsigned long length, gap; - unsigned long low_limit, high_limit; - struct vm_area_struct *tmp; - VMA_ITERATOR(vmi, current->mm, 0); - - /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask + info->start_gap; - if (length < info->length) - return -ENOMEM; - - low_limit = info->low_limit; - if (low_limit < mmap_min_addr) - low_limit = mmap_min_addr; - high_limit = info->high_limit; -retry: - if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) - return -ENOMEM; - - /* - * Adjust for the gap first so it doesn't interfere with the - * later alignment. The first step is the minimum needed to - * fulill the start gap, the next steps is the minimum to align - * that. It is the minimum needed to fulill both. - */ - gap = vma_iter_addr(&vmi) + info->start_gap; - gap += (info->align_offset - gap) & info->align_mask; - tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ - if (vm_start_gap(tmp) < gap + length - 1) { - low_limit = tmp->vm_end; - vma_iter_reset(&vmi); - goto retry; - } - } else { - tmp = vma_prev(&vmi); - if (tmp && vm_end_gap(tmp) > gap) { - low_limit = vm_end_gap(tmp); - vma_iter_reset(&vmi); - goto retry; - } - } - - return gap; -} - -/** - * unmapped_area_topdown() - Find an area between the low_limit and the - * high_limit with the correct alignment and offset at the highest available - * address, all from @info. Note: current->mm is used for the search. - * - * @info: The unmapped area information including the range [low_limit - - * high_limit), the alignment offset and mask. - * - * Return: A memory address or -ENOMEM. - */ -static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) -{ - unsigned long length, gap, gap_end; - unsigned long low_limit, high_limit; - struct vm_area_struct *tmp; - VMA_ITERATOR(vmi, current->mm, 0); - - /* Adjust search length to account for worst case alignment overhead */ - length = info->length + info->align_mask + info->start_gap; - if (length < info->length) - return -ENOMEM; - - low_limit = info->low_limit; - if (low_limit < mmap_min_addr) - low_limit = mmap_min_addr; - high_limit = info->high_limit; -retry: - if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) - return -ENOMEM; - - gap = vma_iter_end(&vmi) - info->length; - gap -= (gap - info->align_offset) & info->align_mask; - gap_end = vma_iter_end(&vmi); - tmp = vma_next(&vmi); - if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ - if (vm_start_gap(tmp) < gap_end) { - high_limit = vm_start_gap(tmp); - vma_iter_reset(&vmi); - goto retry; - } - } else { - tmp = vma_prev(&vmi); - if (tmp && vm_end_gap(tmp) > gap) { - high_limit = tmp->vm_start; - vma_iter_reset(&vmi); - goto retry; - } - } - - return gap; -} - /* * Determine if the allocation needs to ensure that there is no * existing mapping within it's guard gaps, for use as start_gap. @@ -989,211 +941,6 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, return vma; } -/* - * Verify that the stack growth is acceptable and - * update accounting. This is shared with both the - * grow-up and grow-down cases. 
- */ -static int acct_stack_growth(struct vm_area_struct *vma, - unsigned long size, unsigned long grow) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long new_start; - - /* address space limit tests */ - if (!may_expand_vm(mm, vma->vm_flags, grow)) - return -ENOMEM; - - /* Stack limit test */ - if (size > rlimit(RLIMIT_STACK)) - return -ENOMEM; - - /* mlock limit tests */ - if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) - return -ENOMEM; - - /* Check to ensure the stack will not grow into a hugetlb-only region */ - new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : - vma->vm_end - size; - if (is_hugepage_only_range(vma->vm_mm, new_start, size)) - return -EFAULT; - - /* - * Overcommit.. This must be the final test, as it will - * update security statistics. - */ - if (security_vm_enough_memory_mm(mm, grow)) - return -ENOMEM; - - return 0; -} - -#if defined(CONFIG_STACK_GROWSUP) -/* - * PA-RISC uses this for its stack. - * vma is the last one with address > vma->vm_end. Have to extend vma. - */ -static int expand_upwards(struct vm_area_struct *vma, unsigned long address) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next; - unsigned long gap_addr; - int error = 0; - VMA_ITERATOR(vmi, mm, vma->vm_start); - - if (!(vma->vm_flags & VM_GROWSUP)) - return -EFAULT; - - mmap_assert_write_locked(mm); - - /* Guard against exceeding limits of the address space. */ - address &= PAGE_MASK; - if (address >= (TASK_SIZE & PAGE_MASK)) - return -ENOMEM; - address += PAGE_SIZE; - - /* Enforce stack_guard_gap */ - gap_addr = address + stack_guard_gap; - - /* Guard against overflow */ - if (gap_addr < address || gap_addr > TASK_SIZE) - gap_addr = TASK_SIZE; - - next = find_vma_intersection(mm, vma->vm_end, gap_addr); - if (next && vma_is_accessible(next)) { - if (!(next->vm_flags & VM_GROWSUP)) - return -ENOMEM; - /* Check that both stack segments have the same anon_vma? */ - } - - if (next) - vma_iter_prev_range_limit(&vmi, address); - - vma_iter_config(&vmi, vma->vm_start, address); - if (vma_iter_prealloc(&vmi, vma)) - return -ENOMEM; - - /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) { - vma_iter_free(&vmi); - return -ENOMEM; - } - - /* Lock the VMA before expanding to prevent concurrent page faults */ - vma_start_write(vma); - /* We update the anon VMA tree. */ - anon_vma_lock_write(vma->anon_vma); - - /* Somebody else might have raced and expanded it already */ - if (address > vma->vm_end) { - unsigned long size, grow; - - size = address - vma->vm_start; - grow = (address - vma->vm_end) >> PAGE_SHIFT; - - error = -ENOMEM; - if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { - error = acct_stack_growth(vma, size, grow); - if (!error) { - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, grow); - anon_vma_interval_tree_pre_update_vma(vma); - vma->vm_end = address; - /* Overwrite old entry in mtree. */ - vma_iter_store(&vmi, vma); - anon_vma_interval_tree_post_update_vma(vma); - - perf_event_mmap(vma); - } - } - } - anon_vma_unlock_write(vma->anon_vma); - vma_iter_free(&vmi); - validate_mm(mm); - return error; -} -#endif /* CONFIG_STACK_GROWSUP */ - -/* - * vma is the first one with address < vma->vm_start. Have to extend vma. - * mmap_lock held for writing. 
- */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *prev; - int error = 0; - VMA_ITERATOR(vmi, mm, vma->vm_start); - - if (!(vma->vm_flags & VM_GROWSDOWN)) - return -EFAULT; - - mmap_assert_write_locked(mm); - - address &= PAGE_MASK; - if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) - return -EPERM; - - /* Enforce stack_guard_gap */ - prev = vma_prev(&vmi); - /* Check that both stack segments have the same anon_vma? */ - if (prev) { - if (!(prev->vm_flags & VM_GROWSDOWN) && - vma_is_accessible(prev) && - (address - prev->vm_end < stack_guard_gap)) - return -ENOMEM; - } - - if (prev) - vma_iter_next_range_limit(&vmi, vma->vm_start); - - vma_iter_config(&vmi, address, vma->vm_end); - if (vma_iter_prealloc(&vmi, vma)) - return -ENOMEM; - - /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) { - vma_iter_free(&vmi); - return -ENOMEM; - } - - /* Lock the VMA before expanding to prevent concurrent page faults */ - vma_start_write(vma); - /* We update the anon VMA tree. */ - anon_vma_lock_write(vma->anon_vma); - - /* Somebody else might have raced and expanded it already */ - if (address < vma->vm_start) { - unsigned long size, grow; - - size = vma->vm_end - address; - grow = (vma->vm_start - address) >> PAGE_SHIFT; - - error = -ENOMEM; - if (grow <= vma->vm_pgoff) { - error = acct_stack_growth(vma, size, grow); - if (!error) { - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, grow); - anon_vma_interval_tree_pre_update_vma(vma); - vma->vm_start = address; - vma->vm_pgoff -= grow; - /* Overwrite old entry in mtree. */ - vma_iter_store(&vmi, vma); - anon_vma_interval_tree_post_update_vma(vma); - - perf_event_mmap(vma); - } - } - } - anon_vma_unlock_write(vma->anon_vma); - vma_iter_free(&vmi); - validate_mm(mm); - return error; -} - /* enforced gap between the expanding stack and other mappings. */ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; @@ -1325,58 +1072,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return do_vmi_munmap(&vmi, mm, start, len, uf, false); } -unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) -{ - unsigned long ret; - bool writable_file_mapping = false; - - /* Check to see if MDWE is applicable. */ - if (map_deny_write_exec(vm_flags, vm_flags)) - return -EACCES; - - /* Allow architectures to sanity-check the vm_flags. */ - if (!arch_validate_flags(vm_flags)) - return -EINVAL; - - /* Map writable and ensure this isn't a sealed memfd. */ - if (file && is_shared_maywrite(vm_flags)) { - int error = mapping_map_writable(file->f_mapping); - - if (error) - return error; - writable_file_mapping = true; - } - - ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); - - /* Clear our write mapping regardless of error. 
*/ - if (writable_file_mapping) - mapping_unmap_writable(file->f_mapping); - - validate_mm(current->mm); - return ret; -} - -static int __vm_munmap(unsigned long start, size_t len, bool unlock) -{ - int ret; - struct mm_struct *mm = current->mm; - LIST_HEAD(uf); - VMA_ITERATOR(vmi, mm, start); - - if (mmap_write_lock_killable(mm)) - return -EINTR; - - ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); - if (ret || !unlock) - mmap_write_unlock(mm); - - userfaultfd_unmap_complete(mm, &uf); - return ret; -} - int vm_munmap(unsigned long start, size_t len) { return __vm_munmap(start, len, false); @@ -1512,88 +1207,6 @@ out: return ret; } -/* - * do_brk_flags() - Increase the brk vma if the flags match. - * @vmi: The vma iterator - * @addr: The start address - * @len: The length of the increase - * @vma: The vma, - * @flags: The VMA Flags - * - * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags - * do not match then create a new anonymous VMA. Eventually we may be able to - * do some brk-specific accounting here. - */ -static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, unsigned long flags) -{ - struct mm_struct *mm = current->mm; - - /* - * Check against address space limits by the changed size - * Note: This happens *after* clearing old mappings in some code paths. - */ - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) - return -ENOMEM; - - if (mm->map_count > sysctl_max_map_count) - return -ENOMEM; - - if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) - return -ENOMEM; - - /* - * Expand the existing vma if possible; Note that singular lists do not - * occur after forking, so the expand will only happen on new VMAs. - */ - if (vma && vma->vm_end == addr) { - VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); - - vmg.prev = vma; - /* vmi is positioned at prev, which this mode expects. */ - vmg.merge_flags = VMG_FLAG_JUST_EXPAND; - - if (vma_merge_new_range(&vmg)) - goto out; - else if (vmg_nomem(&vmg)) - goto unacct_fail; - } - - if (vma) - vma_iter_next_range(vmi); - /* create a vma struct for an anonymous mapping */ - vma = vm_area_alloc(mm); - if (!vma) - goto unacct_fail; - - vma_set_anonymous(vma); - vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); - vm_flags_init(vma, flags); - vma->vm_page_prot = vm_get_page_prot(flags); - vma_start_write(vma); - if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) - goto mas_store_fail; - - mm->map_count++; - validate_mm(mm); - ksm_add_vma(vma); -out: - perf_event_mmap(vma); - mm->total_vm += len >> PAGE_SHIFT; - mm->data_vm += len >> PAGE_SHIFT; - if (flags & VM_LOCKED) - mm->locked_vm += (len >> PAGE_SHIFT); - vm_flags_set(vma, VM_SOFTDIRTY); - return 0; - -mas_store_fail: - vm_area_free(vma); -unacct_fail: - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; -} - int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; @@ -1664,7 +1277,6 @@ void exit_mmap(struct mm_struct *mm) goto destroy; } - lru_add_drain(); flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? 
but nobody should be looking */ @@ -2107,7 +1719,6 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) vma, new_start, length, false, true)) return -ENOMEM; - lru_add_drain(); tlb_gather_mmu(&tlb, mm); next = vma_next(&vmi); if (new_end > old_start) { @@ -2132,3 +1743,55 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) /* Shrink the vma to just the new range */ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } + +#ifdef CONFIG_MMU +/* + * Obtain a read lock on mm->mmap_lock, if the specified address is below the + * start of the VMA, the intent is to perform a write, and it is a + * downward-growing stack, then attempt to expand the stack to contain it. + * + * This function is intended only for obtaining an argument page from an ELF + * image, and is almost certainly NOT what you want to use for any other + * purpose. + * + * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the + * VMA referenced must not be linked in any user-visible tree, i.e. it must be a + * new VMA being mapped. + * + * The function assumes that addr is either contained within the VMA or below + * it, and makes no attempt to validate this value beyond that. + * + * Returns true if the read lock was obtained and a stack was perhaps expanded, + * false if the stack expansion failed. + * + * On stack expansion the function temporarily acquires an mmap write lock + * before downgrading it. + */ +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, + struct vm_area_struct *new_vma, + unsigned long addr, bool write) +{ + if (!write || addr >= new_vma->vm_start) { + mmap_read_lock(mm); + return true; + } + + if (!(new_vma->vm_flags & VM_GROWSDOWN)) + return false; + + mmap_write_lock(mm); + if (expand_downwards(new_vma, addr)) { + mmap_write_unlock(mm); + return false; + } + + mmap_write_downgrade(mm); + return true; +} +#else +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write) +{ + return false; +} +#endif diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index f186d57df2c6..e7dbaf96aa17 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -17,51 +17,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); -#ifdef CONFIG_MEMCG - -/* - * Size of the buffer for memcg path names. Ignoring stack trace support, - * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. - */ -#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - do { \ - if (trace_mmap_lock_##type##_enabled()) { \ - char buf[MEMCG_PATH_BUF_SIZE]; \ - get_mm_memcg_path(mm, buf, sizeof(buf)); \ - trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ - } \ - } while (0) - -#else /* !CONFIG_MEMCG */ - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) - -#endif /* CONFIG_MEMCG */ - #ifdef CONFIG_TRACING -#ifdef CONFIG_MEMCG -/* - * Write the given mm_struct's memcg path to a buffer. If the path cannot be - * determined, empty string is written. 
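The mmap_read_lock_maybe_expand() helper added above is meant for the binfmt argument-copy case its comment describes. A hypothetical caller might look like this (write_arg_page(), the not-yet-visible stack VMA and the copy step are all assumptions, not part of this patch):

static int write_arg_page(struct mm_struct *mm, struct vm_area_struct *vma,
			  unsigned long pos)
{
	/* "vma" is a freshly created stack VMA, not yet visible to other
	 * threads; "pos" may lie below its current vm_start. */
	if (!mmap_read_lock_maybe_expand(mm, vma, pos, true))
		return -EFAULT;

	/* ... pin the page at "pos" and copy the argument data into it ... */

	mmap_read_unlock(mm);
	return 0;
}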
- */ -static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) -{ - struct mem_cgroup *memcg; - - buf[0] = '\0'; - memcg = get_mem_cgroup_from_mm(mm); - if (memcg == NULL) - return; - if (memcg->css.cgroup) - cgroup_path(memcg->css.cgroup, buf, buflen); - css_put(&memcg->css); -} - -#endif /* CONFIG_MEMCG */ - /* * Trace calls must be in a separate file, as otherwise there's a circular * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. @@ -69,20 +25,20 @@ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) { - TRACE_MMAP_LOCK_EVENT(start_locking, mm, write); + trace_mmap_lock_start_locking(mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { - TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success); + trace_mmap_lock_acquire_returned(mm, write, success); } EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) { - TRACE_MMAP_LOCK_EVENT(released, mm, write); + trace_mmap_lock_released(mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_released); #endif /* CONFIG_TRACING */ diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 99b3e9408aa0..7aa6f18c500b 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -311,11 +311,34 @@ static inline void tlb_table_invalidate(struct mmu_gather *tlb) } } -static void tlb_remove_table_one(void *table) +#ifdef CONFIG_PT_RECLAIM +static inline void __tlb_remove_table_one_rcu(struct rcu_head *head) +{ + struct ptdesc *ptdesc; + + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + __tlb_remove_table(ptdesc); +} + +static inline void __tlb_remove_table_one(void *table) +{ + struct ptdesc *ptdesc; + + ptdesc = table; + call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu); +} +#else +static inline void __tlb_remove_table_one(void *table) { tlb_remove_table_sync_one(); __tlb_remove_table(table); } +#endif /* CONFIG_PT_RECLAIM */ + +static void tlb_remove_table_one(void *table) +{ + __tlb_remove_table_one(table); +} static void tlb_table_flush(struct mmu_gather *tlb) { diff --git a/mm/mseal.c b/mm/mseal.c index 81d6e980e8a9..c27197ac04e8 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -217,9 +217,9 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) unsigned long end; struct mm_struct *mm = current->mm; - ret = can_do_mseal(flags); - if (ret) - return ret; + /* Verify flags not set. 
*/ + if (flags) + return -EINVAL; start = untagged_addr(start); if (!PAGE_ALIGNED(start)) diff --git a/mm/numa.c b/mm/numa.c index e2eec07707d1..f1787d7713a6 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -37,13 +37,7 @@ void __init alloc_node_data(int nid) void __init alloc_offline_node_data(int nid) { pg_data_t *pgdat; - - pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); - if (!pgdat) - panic("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - - node_data[nid] = pgdat; + node_data[nid] = memblock_alloc_or_panic(sizeof(*pgdat), SMP_CACHE_BYTES); } /* Stub functions: */ diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c index 031fb9961bf7..9d55679d99ce 100644 --- a/mm/numa_emulation.c +++ b/mm/numa_emulation.c @@ -8,11 +8,12 @@ #include <linux/memblock.h> #include <linux/numa_memblks.h> #include <asm/numa.h> +#include <acpi/acpi_numa.h> #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -static int emu_nid_to_phys[MAX_NUMNODES]; +int emu_nid_to_phys[MAX_NUMNODES]; static char *emu_cmdline __initdata; int __init numa_emu_cmdline(char *str) @@ -379,6 +380,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); int max_emu_nid, dfl_phys_nid; int i, j, ret; + nodemask_t physnode_mask = numa_nodes_parsed; if (!emu_cmdline) goto no_emu; @@ -395,7 +397,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * split the system RAM into N fake nodes. */ if (strchr(emu_cmdline, 'U')) { - nodemask_t physnode_mask = numa_nodes_parsed; unsigned long n; int nid = 0; @@ -465,9 +466,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) */ max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); - /* commit */ - *numa_meminfo = ei; - /* Make sure numa_nodes_parsed only contains emulated nodes */ nodes_clear(numa_nodes_parsed); for (i = 0; i < ARRAY_SIZE(ei.blk); i++) @@ -475,10 +473,21 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) ei.blk[i].nid != NUMA_NO_NODE) node_set(ei.blk[i].nid, numa_nodes_parsed); - numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys)); + /* fix pxm_to_node_map[] and node_to_pxm_map[] to avoid collision + * with faked numa nodes, particularly during later memory hotplug + * handling, and also update numa_nodes_parsed accordingly. 
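The mm/numa.c hunk above (and the mm/percpu.c hunks later in this diff) apply the same mechanical conversion; the open-coded NULL-check-and-panic at each call site collapses into the helper. The shape of the change, with ptr and size standing in for the call-site specifics:

	/* old style, repeated at many call sites */
	ptr = memblock_alloc(size, SMP_CACHE_BYTES);
	if (!ptr)
		panic("%s: Failed to allocate %zu bytes\n", __func__, size);

	/* new style: the check and the panic live in the helper */
	ptr = memblock_alloc_or_panic(size, SMP_CACHE_BYTES);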
+ */ + ret = fix_pxm_node_maps(max_emu_nid); + if (ret < 0) + goto no_emu; + + /* commit */ + *numa_meminfo = ei; + + numa_emu_update_cpu_to_node(emu_nid_to_phys, max_emu_nid + 1); /* make sure all emulated nodes are mapped to a physical node */ - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + for (i = 0; i < max_emu_nid + 1; i++) if (emu_nid_to_phys[i] == NUMA_NO_NODE) emu_nid_to_phys[i] = dfl_phys_nid; @@ -501,12 +510,34 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) numa_set_distance(i, j, dist); } } + for (i = 0; i < numa_distance_cnt; i++) { + for (j = 0; j < numa_distance_cnt; j++) { + int physi, physj; + u8 dist; + + /* distance between fake nodes is already ok */ + if (emu_nid_to_phys[i] != NUMA_NO_NODE && + emu_nid_to_phys[j] != NUMA_NO_NODE) + continue; + if (emu_nid_to_phys[i] != NUMA_NO_NODE) + physi = emu_nid_to_phys[i]; + else + physi = i - max_emu_nid; + if (emu_nid_to_phys[j] != NUMA_NO_NODE) + physj = emu_nid_to_phys[j]; + else + physj = j - max_emu_nid; + dist = phys_dist[physi * numa_dist_cnt + physj]; + numa_set_distance(i, j, dist); + } + } /* free the copied physical distance table */ memblock_free(phys_dist, phys_size); return; no_emu: + numa_nodes_parsed = physnode_mask; /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) emu_nid_to_phys[i] = i; diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index a3877e9bc878..ff4054f4334d 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -7,7 +7,7 @@ #include <linux/numa.h> #include <linux/numa_memblks.h> -static int numa_distance_cnt; +int numa_distance_cnt; static u8 *numa_distance; nodemask_t numa_nodes_parsed __initdata; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1c485beb0b93..1cf121ad7085 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include <linux/init.h> #include <linux/mmu_notifier.h> #include <linux/cred.h> +#include <linux/nmi.h> #include <asm/tlb.h> #include "internal.h" @@ -430,10 +431,15 @@ static void dump_tasks(struct oom_control *oc) mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); else { struct task_struct *p; + int i = 0; rcu_read_lock(); - for_each_process(p) + for_each_process(p) { + /* Avoid potential softlockup warning */ + if ((++i & 1023) == 0) + touch_softlockup_watchdog(); dump_task(p, oc); + } rcu_read_unlock(); } } @@ -699,7 +705,7 @@ static void queue_oom_reaper(struct task_struct *tsk) } #ifdef CONFIG_SYSCTL -static struct ctl_table vm_oom_kill_table[] = { +static const struct ctl_table vm_oom_kill_table[] = { { .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d9861e42b2bd..eb55ece39c56 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -942,26 +942,25 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); - wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); - if (wb_thresh > wb_max_thresh) - wb_thresh = wb_max_thresh; /* - * With strictlimit flag, the wb_thresh is treated as - * a hard limit in balance_dirty_pages() and wb_position_ratio(). - * It's possible that wb_thresh is close to zero, not because - * the device is slow, but because it has been inactive. - * To prevent occasional writes from being blocked, we raise wb_thresh. 
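The dump_tasks() change above rate-limits the watchdog touch with a power-of-two mask: (++i & 1023) == 0 is true exactly once every 1024 tasks, since 1023 masks the low ten bits of the counter. A trivial userspace check of that arithmetic:

#include <stdio.h>

int main(void)
{
	int i, hits = 0;

	for (i = 1; i <= 4096; i++)
		if ((i & 1023) == 0)
			hits++;

	printf("%d hits in 4096 iterations\n", hits);  /* prints 4 */
	return 0;
}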
+ * It's very possible that wb_thresh is close to 0 not because the + * device is slow, but that it has remained inactive for long time. + * Honour such devices a reasonable good (hopefully IO efficient) + * threshold, so that the occasional writes won't be blocked and active + * writes can rampup the threshold quickly. */ - if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { - unsigned long limit = hard_dirty_limit(dom, dtc->thresh); - u64 wb_scale_thresh = 0; - - if (limit > dtc->dirty) - wb_scale_thresh = (limit - dtc->dirty) / 100; - wb_thresh = max(wb_thresh, min(wb_scale_thresh, wb_max_thresh / 4)); + if (thresh > dtc->dirty) { + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100); + else + wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8); } + wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); + if (wb_thresh > wb_max_thresh) + wb_thresh = wb_max_thresh; + return wb_thresh; } @@ -969,6 +968,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + domain_dirty_avail(&gdtc, true); return __wb_calc_thresh(&gdtc, thresh); } @@ -1145,12 +1145,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long wb_pos_ratio; - if (dtc->wb_dirty < 8) { - dtc->pos_ratio = min_t(long long, pos_ratio * 2, - 2 << RATELIMIT_CALC_SHIFT); - return; - } - if (dtc->wb_dirty >= wb_thresh) return; @@ -1222,14 +1216,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc) if (unlikely(wb_thresh > dtc->thresh)) wb_thresh = dtc->thresh; /* - * It's very possible that wb_thresh is close to 0 not because the - * device is slow, but that it has remained inactive for long time. - * Honour such devices a reasonable good (hopefully IO efficient) - * threshold, so that the occasional writes won't be blocked and active - * writes can rampup the threshold quickly. - */ - wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); - /* * scale global setpoint to wb's: * wb_setpoint = setpoint * wb_thresh / thresh */ @@ -1484,17 +1470,10 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use wb_dirty as * "dirty" and wb_setpoint as "setpoint". - * - * We rampup dirty_ratelimit forcibly if wb_dirty is low because - * it's possible that wb_thresh is close to zero due to inactivity - * of backing device. 
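The reordered __wb_calc_thresh() logic above now raises a too-small per-writeback threshold before the wb_max_ratio clamp is applied: an inactive device whose proportional share has decayed toward zero still gets a floor of (thresh - dirty) / 8, or (thresh - dirty) / 100 under BDI_CAP_STRICTLIMIT. A toy calculation with made-up page counts:

#include <stdio.h>

int main(void)
{
	unsigned long thresh    = 100000;  /* global dirty threshold (pages) */
	unsigned long dirty     = 20000;   /* currently dirty pages          */
	unsigned long wb_thresh = 50;      /* idle device: share has decayed */

	unsigned long floor_default     = (thresh - dirty) / 8;    /* 10000 */
	unsigned long floor_strictlimit = (thresh - dirty) / 100;  /*   800 */

	if (wb_thresh < floor_default)
		wb_thresh = floor_default;

	/* ...and only then is the result clamped to wb_max_thresh. */
	printf("raised wb_thresh: %lu pages (strictlimit floor %lu)\n",
	       wb_thresh, floor_strictlimit);
	return 0;
}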
*/ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = dtc->wb_dirty; - if (dtc->wb_dirty < 8) - setpoint = dtc->wb_dirty + 1; - else - setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; + setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { @@ -2319,7 +2298,7 @@ static int page_writeback_cpu_online(unsigned int cpu) /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -static struct ctl_table vm_page_writeback_sysctls[] = { +static const struct ctl_table vm_page_writeback_sysctls[] = { { .procname = "dirty_background_ratio", .data = &dirty_background_ratio, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 01eab25edf89..542d25f77be8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1295,12 +1295,6 @@ void __meminit __free_pages_core(struct page *page, unsigned int order, set_page_count(p, 0); } - /* - * Freeing the page with debug_pagealloc enabled will try to - * unmap it; some archs don't like double-unmappings, so - * map it first. - */ - debug_pagealloc_map_pages(page, nr_pages); adjust_managed_page_count(page, nr_pages); } else { for (loop = 0; loop < nr_pages; loop++, p++) { @@ -1508,7 +1502,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order, int i; set_page_private(page, 0); - set_page_refcounted(page); arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); @@ -1856,6 +1849,14 @@ static bool can_steal_fallback(unsigned int order, int start_mt) if (order >= pageblock_order) return true; + /* + * Movable pages won't cause permanent fragmentation, so when you alloc + * small pages, you just need to temporarily steal unmovable or + * reclaimable pages that are closest to the request size. After a + * while, memory compaction may occur to form large contiguous pages, + * and the next movable allocation may not need to steal. Unmovable and + * reclaimable allocations need to actually steal pages. + */ if (order >= pageblock_order / 2 || start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE || @@ -2592,9 +2593,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return high; } -static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, - struct page *page, int migratetype, - unsigned int order) +static void free_frozen_page_commit(struct zone *zone, + struct per_cpu_pages *pcp, struct page *page, int migratetype, + unsigned int order) { int high, batch; int pindex; @@ -2643,7 +2644,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, /* * Free a pcp page */ -void free_unref_page(struct page *page, unsigned int order) +void free_frozen_pages(struct page *page, unsigned int order) { unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; @@ -2666,20 +2667,20 @@ void free_unref_page(struct page *page, unsigned int order) * get those areas back if necessary. 
Otherwise, we may have to free * excessively into the page allocator */ + zone = page_zone(page); migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; } - zone = page_zone(page); pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, migratetype, order); + free_frozen_page_commit(zone, pcp, page, migratetype, order); pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, FPI_NONE); @@ -2743,7 +2744,7 @@ void free_unref_folios(struct folio_batch *folios) /* * Free isolated pages directly to the - * allocator, see comment in free_unref_page. + * allocator, see comment in free_frozen_pages. */ if (is_migrate_isolate(migratetype)) { free_one_page(zone, &folio->page, pfn, @@ -2774,7 +2775,7 @@ void free_unref_folios(struct folio_batch *folios) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(&folio->page); - free_unref_page_commit(zone, pcp, &folio->page, migratetype, + free_frozen_page_commit(zone, pcp, &folio->page, migratetype, order); } @@ -3567,7 +3568,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, if (!page) page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - return page; } @@ -4243,6 +4243,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, restart: compaction_retries = 0; no_progress_loops = 0; + compact_result = COMPACT_SKIPPED; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); zonelist_iter_cookie = zonelist_iter_begin(); @@ -4531,28 +4532,23 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array * @gfp: GFP flags for the allocation * @preferred_nid: The preferred NUMA node ID to allocate from * @nodemask: Set of nodes to allocate from, may be NULL - * @nr_pages: The number of pages desired on the list or array - * @page_list: Optional list to store the allocated pages - * @page_array: Optional array to store the pages + * @nr_pages: The number of pages desired in the array + * @page_array: Array to store the pages * * This is a batched version of the page allocator that attempts to - * allocate nr_pages quickly. Pages are added to page_list if page_list - * is not NULL, otherwise it is assumed that the page_array is valid. - * - * For lists, nr_pages is the number of pages that should be allocated. + * allocate nr_pages quickly. Pages are added to the page_array. * - * For arrays, only NULL elements are populated with pages and nr_pages + * Note that only NULL elements are populated with pages and nr_pages * is the maximum number of pages that will be stored in the array. * - * Returns the number of pages on the list or array. + * Returns the number of pages in the array. */ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, struct page **page_array) { struct page *page; @@ -4570,7 +4566,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, * Skip populated array elements to determine if any pages need * to be allocated before disabling IRQs. 
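With the list variant gone, alloc_pages_bulk_noprof() above is array-only: only NULL slots are filled and the return value is the total number of populated entries, so a partially filled array can simply be passed back in to top it up. A hypothetical caller (fill_page_array() is an assumption; real callers normally go through a wrapping macro rather than the _noprof symbol):

static int fill_page_array(struct page **pages, int nr)
{
	unsigned long filled;

	filled = alloc_pages_bulk_noprof(GFP_KERNEL, numa_mem_id(), NULL,
					 nr, pages);
	if (filled < nr)
		return -ENOMEM;  /* or retry: already-filled slots are kept */

	return 0;
}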
*/ - while (page_array && nr_populated < nr_pages && page_array[nr_populated]) + while (nr_populated < nr_pages && page_array[nr_populated]) nr_populated++; /* No pages requested? */ @@ -4578,7 +4574,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, goto out; /* Already populated array? */ - if (unlikely(page_array && nr_pages - nr_populated == 0)) + if (unlikely(nr_pages - nr_populated == 0)) goto out; /* Bulk allocator does not support memcg accounting. */ @@ -4660,7 +4656,7 @@ retry_this_zone: while (nr_populated < nr_pages) { /* Skip existing pages */ - if (page_array && page_array[nr_populated]) { + if (page_array[nr_populated]) { nr_populated++; continue; } @@ -4678,11 +4674,8 @@ retry_this_zone: nr_account++; prep_new_page(page, 0, gfp, 0); - if (page_list) - list_add(&page->lru, page_list); - else - page_array[nr_populated] = page; - nr_populated++; + set_page_refcounted(page); + page_array[nr_populated++] = page; } pcp_spin_unlock(pcp); @@ -4699,14 +4692,8 @@ failed_irq: failed: page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); - if (page) { - if (page_list) - list_add(&page->lru, page_list); - else - page_array[nr_populated] = page; - nr_populated++; - } - + if (page) + page_array[nr_populated++] = page; goto out; } EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); @@ -4714,8 +4701,8 @@ EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); /* * This is the 'heart' of the zoned buddy allocator. */ -struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, - int preferred_nid, nodemask_t *nodemask) +struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4768,7 +4755,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, out: if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { - __free_pages(page, order); + free_frozen_pages(page, order); page = NULL; } @@ -4777,6 +4764,18 @@ out: return page; } +EXPORT_SYMBOL(__alloc_frozen_pages_noprof); + +struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) +{ + struct page *page; + + page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask); + if (page) + set_page_refcounted(page); + return page; +} EXPORT_SYMBOL(__alloc_pages_noprof); struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, @@ -4837,11 +4836,11 @@ void __free_pages(struct page *page, unsigned int order) struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) - free_unref_page(page, order); + free_frozen_pages(page, order); else if (!head) { pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) - free_unref_page(page + (1 << order), order); + free_frozen_pages(page + (1 << order), order); } } EXPORT_SYMBOL(__free_pages); @@ -5161,13 +5160,6 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) zonerefs->zone_idx = 0; } -/* - * Build zonelists ordered by zone and nodes within zones. - * This results in conserving DMA zone[s] until all Normal memory is - * exhausted, but results in overflowing to remote node while memory - * may still exist in local DMA zone. 
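The split above makes __alloc_pages_noprof() just the frozen allocation plus set_page_refcounted(), with free_frozen_pages() as the matching free that skips the refcount handling. A sketch of a hypothetical user that tracks its pages by other means and therefore pairs the frozen calls directly ("my_buf" is made up):

static struct page *my_buf_alloc(void)
{
	/* Returned page has no reference taken on it. */
	return __alloc_frozen_pages_noprof(GFP_KERNEL, 0, numa_mem_id(), NULL);
}

static void my_buf_free(struct page *page)
{
	if (page)
		free_frozen_pages(page, 0);
}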
- */ - static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; @@ -5858,11 +5850,10 @@ static void setup_per_zone_lowmem_reserve(void) for (j = i + 1; j < MAX_NR_ZONES; j++) { struct zone *upper_zone = &pgdat->node_zones[j]; - bool empty = !zone_managed_pages(upper_zone); managed_pages += zone_managed_pages(upper_zone); - if (clear || empty) + if (clear) zone->lowmem_reserve[j] = 0; else zone->lowmem_reserve[j] = managed_pages / ratio; @@ -6175,7 +6166,7 @@ out: return ret; } -static struct ctl_table page_alloc_sysctl_table[] = { +static const struct ctl_table page_alloc_sysctl_table[] = { { .procname = "min_free_kbytes", .data = &min_free_kbytes, @@ -6270,9 +6261,8 @@ static void alloc_contig_dump_pages(struct list_head *page_list) * @migratetype: using migratetype to filter the type of migration in * trace_mm_alloc_contig_migrate_range_info. */ -int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end, - int migratetype) +static int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end, int migratetype) { /* This function is based on compact_zone() from compaction.c. */ unsigned int nr_reclaimed; @@ -6281,7 +6271,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, int ret = 0; struct migration_target_control mtc = { .nid = zone_to_nid(cc->zone), - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + .gfp_mask = cc->gfp_mask, .reason = MR_CONTIG_RANGE, }; struct page *page; @@ -6351,7 +6341,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, return (ret < 0) ? ret : 0; } -static void split_free_pages(struct list_head *list) +static void split_free_pages(struct list_head *list, gfp_t gfp_mask) { int order; @@ -6362,7 +6352,8 @@ static void split_free_pages(struct list_head *list) list_for_each_entry_safe(page, next, &list[order], lru) { int i; - post_alloc_hook(page, order, __GFP_MOVABLE); + post_alloc_hook(page, order, gfp_mask); + set_page_refcounted(page); if (!order) continue; @@ -6376,6 +6367,40 @@ static void split_free_pages(struct list_head *list) } } +static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) +{ + const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | + __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO; + const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN; + + /* + * We are given the range to allocate; node, mobility and placement + * hints are irrelevant at this point. We'll simply ignore them. + */ + gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE | + __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE); + + /* + * We only support most reclaim flags (but not NOFAIL/NORETRY), and + * selected action flags. + */ + if (gfp_mask & ~(reclaim_mask | action_mask)) + return -EINVAL; + + /* + * Flags to control page compaction/migration/reclaim, to free up our + * page range. Migratable pages are movable, __GFP_MOVABLE is implied + * for them. + * + * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that + * to not degrade callers. + */ + *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) | + __GFP_MOVABLE | __GFP_RETRY_MAYFAIL; + return 0; +} + /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate @@ -6384,7 +6409,9 @@ static void split_free_pages(struct list_head *list) * #MIGRATE_MOVABLE or #MIGRATE_CMA). 
All pageblocks * in range must have the same migratetype and it must * be either of the two. - * @gfp_mask: GFP mask to use during compaction + * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some + * action and reclaim modifiers are supported. Reclaim modifiers + * control allocation behavior during compaction/migration/reclaim. * * The PFN range does not have to be pageblock aligned. The PFN range must * belong to a single zone. @@ -6410,11 +6437,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .no_set_skip_hint = true, - .gfp_mask = current_gfp_context(gfp_mask), .alloc_contig = true, }; INIT_LIST_HEAD(&cc.migratepages); + gfp_mask = current_gfp_context(gfp_mask); + if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) + return -EINVAL; + /* * What we do here is we mark all pageblocks in range as * MIGRATE_ISOLATE. Because pageblock and max order pages may @@ -6436,7 +6466,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, * put back to page allocator so that buddy can use them. */ - ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask); + ret = start_isolate_page_range(start, end, migratetype, 0); if (ret) goto done; @@ -6455,7 +6485,17 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, ret = __alloc_contig_migrate_range(&cc, start, end, migratetype); if (ret && ret != -EBUSY) goto done; - ret = 0; + + /* + * When in-use hugetlb pages are migrated, they may simply be released + * back into the free hugepage pool instead of being returned to the + * buddy system. After the migration of in-use huge pages is completed, + * we will invoke replace_free_hugepage_folios() to ensure that these + * hugepages are properly released to the buddy system. + */ + ret = replace_free_hugepage_folios(start, end); + if (ret) + goto done; /* * Pages from [start, end) are within a pageblock_nr_pages @@ -6489,7 +6529,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, } if (!(gfp_mask & __GFP_COMP)) { - split_free_pages(cc.freepages); + split_free_pages(cc.freepages, gfp_mask); /* Free head and tail (if any) */ if (start != outer_start) @@ -6502,6 +6542,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); + set_page_refcounted(head); } else { ret = -EINVAL; WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", @@ -6556,7 +6597,9 @@ static bool zone_spans_last_pfn(const struct zone *zone, /** * alloc_contig_pages() -- tries to find and allocate contiguous range of pages * @nr_pages: Number of contiguous pages to allocate - * @gfp_mask: GFP mask to limit search and used during compaction + * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some + * action and reclaim modifiers are supported. Reclaim modifiers + * control allocation behavior during compaction/migration/reclaim. 
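In practical terms, the gfp contract that __alloc_contig_verify_gfp_mask() above enforces looks like this to a caller (the surrounding function and PFN arguments are made up):

static int example_grab_range(unsigned long start_pfn, unsigned long end_pfn)
{
	/* Reclaim modifiers plus a few action flags are accepted. */
	int ret = alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
				     GFP_KERNEL | __GFP_NOWARN);

	/* Placement hints (__GFP_THISNODE, zone bits, ...) are stripped and
	 * ignored rather than rejected, but flags outside the supported set,
	 * __GFP_NOFAIL for instance, now fail with -EINVAL before any
	 * isolation work is done. */
	return ret;
}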
* @nid: Target node * @nodemask: Mask for other possible nodes * @@ -6961,7 +7004,7 @@ static inline bool has_unaccepted_memory(void) static bool cond_accept_memory(struct zone *zone, unsigned int order) { - long to_accept; + long to_accept, wmark; bool ret = false; if (!has_unaccepted_memory()) @@ -6970,8 +7013,18 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order) if (list_empty(&zone->unaccepted_pages)) return false; + wmark = promo_wmark_pages(zone); + + /* + * Watermarks have not been initialized yet. + * + * Accepting one MAX_ORDER page to ensure progress. + */ + if (!wmark) + return try_to_accept_memory_one(zone); + /* How much to accept to get to promo watermark? */ - to_accept = promo_wmark_pages(zone) - + to_accept = wmark - (zone_page_state(zone, NR_FREE_PAGES) - __zone_watermark_unusable_free(zone, order, 0) - zone_page_state(zone, NR_UNACCEPTED)); diff --git a/mm/page_counter.c b/mm/page_counter.c index b249d15af9dd..af23f927611b 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -288,7 +288,7 @@ int page_counter_memparse(const char *buf, const char *max, } -#ifdef CONFIG_MEMCG +#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) /* * This function calculates an individual page counter's effective * protection which is derived from its own memory.min/low, its @@ -460,4 +460,4 @@ void page_counter_calculate_protection(struct page_counter *root, atomic_long_read(&parent->children_low_usage), recursive_protection)); } -#endif /* CONFIG_MEMCG */ +#endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */ diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c index 3f7a203d35c6..d2423f30577e 100644 --- a/mm/page_frag_cache.c +++ b/mm/page_frag_cache.c @@ -86,7 +86,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) - free_unref_page(page, compound_order(page)); + free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); @@ -138,7 +138,7 @@ refill: goto refill; if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) { - free_unref_page(page, + free_frozen_pages(page, encoded_page_decode_order(encoded_page)); goto refill; } @@ -166,6 +166,6 @@ void page_frag_free(void *addr) struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) - free_unref_page(page, compound_order(page)); + free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(page_frag_free); diff --git a/mm/page_idle.c b/mm/page_idle.c index 41ea77f22011..947c7c7a3728 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -112,7 +112,7 @@ static void page_idle_clear_pte_refs(struct folio *folio) } static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t pos, size_t count) { u64 *out = (u64 *)buf; @@ -157,7 +157,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, } static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t pos, size_t count) { const u64 *in = (u64 *)buf; @@ -193,17 +193,17 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, return (char *)in - buf; } -static struct bin_attribute page_idle_bitmap_attr = +static const struct bin_attribute page_idle_bitmap_attr = __BIN_ATTR(bitmap, 0600, page_idle_bitmap_read, 
page_idle_bitmap_write, 0); -static struct bin_attribute *page_idle_bin_attrs[] = { +static const struct bin_attribute *const page_idle_bin_attrs[] = { &page_idle_bitmap_attr, NULL, }; static const struct attribute_group page_idle_attr_group = { - .bin_attrs = page_idle_bin_attrs, + .bin_attrs_new = page_idle_bin_attrs, .name = "page_idle", }; diff --git a/mm/page_io.c b/mm/page_io.c index 4b4ea8e49cf6..9b983de351f9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -163,7 +163,6 @@ reprobe: page_no = 1; /* force Empty message */ sis->max = page_no; sis->pages = page_no - 1; - sis->highest_bit = page_no - 1; out: return ret; bad_bmap: diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 7e04047977cf..a051a29e95ad 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -286,7 +286,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * within a free or in-use page. * @boundary_pfn: pageblock-aligned pfn that a page might cross * @flags: isolation flags - * @gfp_flags: GFP flags used for migrating pages * @isolate_before: isolate the pageblock before the boundary_pfn * @skip_isolation: the flag to skip the pageblock isolation in second * isolate_single_pageblock() @@ -306,8 +305,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * the in-use page then splitting the free page. */ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, - gfp_t gfp_flags, bool isolate_before, bool skip_isolation, - int migratetype) + bool isolate_before, bool skip_isolation, int migratetype) { unsigned long start_pfn; unsigned long isolate_pageblock; @@ -444,8 +442,6 @@ failed: * and PageOffline() pages. * REPORT_FAILURE - report details about the failure to * isolate the range - * @gfp_flags: GFP flags used for migrating pages that sit across the - * range boundaries. * * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in * the range will never be allocated. Any free pages and pages freed in the @@ -478,7 +474,7 @@ failed: * Return: 0 on success and -EBUSY if any part of range cannot be isolated. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags, gfp_t gfp_flags) + int migratetype, int flags) { unsigned long pfn; struct page *page; @@ -489,7 +485,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, bool skip_isolation = false; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, + ret = isolate_single_pageblock(isolate_start, flags, false, skip_isolation, migratetype); if (ret) return ret; @@ -498,7 +494,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, skip_isolation = true; /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, + ret = isolate_single_pageblock(isolate_end, flags, true, skip_isolation, migratetype); if (ret) { unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); @@ -612,6 +608,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, int ret; /* + * Due to the deferred freeing of hugetlb folios, the hugepage folios may + * not immediately release to the buddy system. This can cause PageBuddy() + * to fail in __test_page_isolated_in_pageblock(). 
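Callers of page isolation see the same simplification: the gfp mask that used to be threaded through for pages straddling the range boundaries is gone, and the migration policy now stays with alloc_contig_range()'s own verified mask. The call-site change, with made-up PFNs and migratetype:

	/* before */
	ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_CMA, 0,
				       GFP_KERNEL);

	/* after: no gfp argument */
	ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_CMA, 0);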
To ensure that the + * hugetlb folios are properly released back to the buddy system, we + * invoke the wait_for_freed_hugetlb_folios() function to wait for the + * release to complete. + */ + wait_for_freed_hugetlb_folios(); + + /* * Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free * pages are not aligned to pageblock_nr_pages. * Then we just check migratetype first. diff --git a/mm/percpu.c b/mm/percpu.c index d8dd31a2e407..7b5835356d1e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1359,10 +1359,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, /* allocate chunk */ alloc_size = struct_size(chunk, populated, BITS_TO_LONGS(region_size >> PAGE_SHIFT)); - chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); INIT_LIST_HEAD(&chunk->list); @@ -1374,24 +1371,14 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_bits = pcpu_chunk_map_bits(chunk); alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); - chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->alloc_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); - chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->bound_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); - chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->md_blocks) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); - + chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); #ifdef NEED_PCPUOBJ_EXT /* first chunk is free to use */ chunk->obj_exts = NULL; @@ -2595,28 +2582,16 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* process group information and build config tables accordingly */ alloc_size = ai->nr_groups * sizeof(group_offsets[0]); - group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!group_offsets) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = ai->nr_groups * sizeof(group_sizes[0]); - group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!group_sizes) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_map[0]); - unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!unit_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_off[0]); - unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!unit_off) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -2685,12 +2660,9 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_free_slot = pcpu_sidelined_slot + 1; 
pcpu_to_depopulate_slot = pcpu_free_slot + 1; pcpu_nr_slots = pcpu_to_depopulate_slot + 1; - pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots * + pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]), SMP_CACHE_BYTES); - if (!pcpu_chunk_lists) - panic("%s: Failed to allocate %zu bytes\n", __func__, - pcpu_nr_slots * sizeof(pcpu_chunk_lists[0])); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_chunk_lists[i]); @@ -3099,7 +3071,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, continue; } /* copy and return the unused part */ - memcpy(ptr, __per_cpu_load, ai->static_size); + memcpy(ptr, __per_cpu_start, ai->static_size); pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } @@ -3155,25 +3127,19 @@ void __init __weak pcpu_populate_pte(unsigned long addr) pmd_t *pmd; if (pgd_none(*pgd)) { - p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); - if (!p4d) - goto err_alloc; + p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE); pgd_populate(&init_mm, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { - pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); - if (!pud) - goto err_alloc; + pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); p4d_populate(&init_mm, p4d, pud); } pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); - if (!pmd) - goto err_alloc; + pmd = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE); pud_populate(&init_mm, pud, pmd); } @@ -3181,16 +3147,11 @@ void __init __weak pcpu_populate_pte(unsigned long addr) if (!pmd_present(*pmd)) { pte_t *new; - new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); - if (!new) - goto err_alloc; + new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE); pmd_populate_kernel(&init_mm, pmd, new); } return; - -err_alloc: - panic("%s: Failed to allocate memory\n", __func__); } /** @@ -3237,10 +3198,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); - if (!pages) - panic("%s: Failed to allocate %zu bytes\n", __func__, - pages_size); + pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES); /* allocate pages */ j = 0; @@ -3282,7 +3240,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size); /* copy static data */ - memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); + memcpy((void *)unit_addr, __per_cpu_start, ai->static_size); } /* we're ready, commit */ diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c new file mode 100644 index 000000000000..7e9455a18aae --- /dev/null +++ b/mm/pt_reclaim.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/hugetlb.h> +#include <asm-generic/tlb.h> +#include <asm/pgalloc.h> + +#include "internal.h" + +bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details) +{ + return details && details->reclaim_pt && (end - start >= PMD_SIZE); +} + +bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval) +{ + spinlock_t *pml = pmd_lockptr(mm, pmd); + + if (!spin_trylock(pml)) + return false; + + *pmdval = pmdp_get_lockless(pmd); + pmd_clear(pmd); + spin_unlock(pml); + + return true; +} + +void free_pte(struct mm_struct *mm, unsigned long addr, struct 
mmu_gather *tlb, + pmd_t pmdval) +{ + pte_free_tlb(tlb, pmd_pgtable(pmdval), addr); + mm_dec_nr_ptes(mm); +} + +void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, + struct mmu_gather *tlb) +{ + pmd_t pmdval; + spinlock_t *pml, *ptl = NULL; + pte_t *start_pte, *pte; + int i; + + pml = pmd_lock(mm, pmd); + start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl); + if (!start_pte) + goto out_ptl; + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + + /* Check if it is empty PTE page */ + for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(ptep_get(pte))) + goto out_ptl; + } + pte_unmap(start_pte); + + pmd_clear(pmd); + + if (ptl != pml) + spin_unlock(ptl); + spin_unlock(pml); + + free_pte(mm, addr, tlb, pmdval); + + return; +out_ptl: + if (start_pte) + pte_unmap_unlock(start_pte, ptl); + if (ptl != pml) + spin_unlock(pml); +} diff --git a/mm/readahead.c b/mm/readahead.c index e151f4b13ca4..6a4e96b69702 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -158,20 +158,10 @@ static void read_pages(struct readahead_control *rac) if (aops->readahead) { aops->readahead(rac); - /* - * Clean up the remaining folios. The sizes in ->ra - * may be used to size the next readahead, so make sure - * they accurately reflect what happened. - */ + /* Clean up the remaining folios. */ while ((folio = readahead_folio(rac)) != NULL) { - unsigned long nr = folio_nr_pages(folio); - folio_get(folio); - rac->ra->size -= nr; - if (rac->ra->async_size >= nr) { - rac->ra->async_size -= nr; - filemap_remove_folio(folio); - } + filemap_remove_folio(folio); folio_unlock(folio); folio_put(folio); } @@ -188,6 +178,18 @@ static void read_pages(struct readahead_control *rac) BUG_ON(readahead_count(rac)); } +static struct folio *ractl_alloc_folio(struct readahead_control *ractl, + gfp_t gfp_mask, unsigned int order) +{ + struct folio *folio; + + folio = filemap_alloc_folio(gfp_mask, order); + if (folio && ractl->dropbehind) + __folio_set_dropbehind(folio); + + return folio; +} + /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. @@ -265,8 +267,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, continue; } - folio = filemap_alloc_folio(gfp_mask, - mapping_min_folio_order(mapping)); + folio = ractl_alloc_folio(ractl, gfp_mask, + mapping_min_folio_order(mapping)); if (!folio) break; @@ -436,7 +438,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; - struct folio *folio = filemap_alloc_folio(gfp, order); + struct folio *folio = ractl_alloc_folio(ractl, gfp, order); if (!folio) return -ENOMEM; @@ -458,7 +460,8 @@ void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra, unsigned int new_order) { struct address_space *mapping = ractl->mapping; - pgoff_t index = readahead_index(ractl); + pgoff_t start = readahead_index(ractl); + pgoff_t index = start; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; @@ -516,12 +519,18 @@ void page_cache_ra_order(struct readahead_control *ractl, /* * If there were already pages in the page cache, then we may have * left some gaps. Let the regular readahead code take care of this - * situation. + * situation below. 
*/ if (!err) return; fallback: - do_page_cache_ra(ractl, ra->size, ra->async_size); + /* + * ->readahead() may have updated readahead window size so we have to + * check there's still something to read. + */ + if (ra->size > index - start) + do_page_cache_ra(ractl, ra->size - (index - start), + ra->async_size); } static unsigned long ractl_max_pages(struct readahead_control *ractl, @@ -754,7 +763,7 @@ void readahead_expand(struct readahead_control *ractl, if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ - folio = filemap_alloc_folio(gfp_mask, min_order); + folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; @@ -783,7 +792,7 @@ void readahead_expand(struct readahead_control *ractl, if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ - folio = filemap_alloc_folio(gfp_mask, min_order); + folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 6d783436951f..e7173fcd210c 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -12,7 +12,8 @@ #include <linux/mm.h> #include <asm/sections.h> -static const int rodata_test_data = 0xC3; +#define TEST_VALUE 0xC3 +static const int rodata_test_data = TEST_VALUE; void rodata_test(void) { @@ -20,7 +21,7 @@ void rodata_test(void) /* test 1: read the value */ /* If this test fails, some previous testrun has clobbered the state */ - if (!rodata_test_data) { + if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) { pr_err("test 1 fails (start data)\n"); return; } @@ -33,7 +34,7 @@ void rodata_test(void) } /* test 3: check the value hasn't changed */ - if (rodata_test_data == zero) { + if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) { pr_err("test data was changed\n"); return; } diff --git a/mm/secretmem.c b/mm/secretmem.c index 399552814fd0..1b0a214ee558 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -195,14 +195,13 @@ static struct file *secretmem_file_create(unsigned long flags) struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; - const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); int err; inode = alloc_anon_inode(secretmem_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); - err = security_inode_init_security_anon(inode, &qname, NULL); + err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL); if (err) { file = ERR_PTR(err); goto err_free_inode; diff --git a/mm/shmem.c b/mm/shmem.c index 532afd8e049c..ab61c8bb20e1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -553,38 +553,105 @@ static bool shmem_confirm_swap(struct address_space *mapping, /* ifdef here to avoid bloating shmem.o when not necessary */ static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; +static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER; -static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - unsigned long vm_flags) +/** + * shmem_mapping_size_orders - Get allowable folio orders for the given file size. + * @mapping: Target address_space. + * @index: The page index. + * @write_end: end of a write, could extend inode size. + * + * This returns huge orders for folios (when supported) based on the file size + * which the mapping currently allows at the given index. The index is relevant + * due to alignment considerations the mapping might have. The returned order + * may be less than the size passed. + * + * Return: The orders. 
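The readahead fallback above now accounts for whatever the large-folio loop already managed to insert: it only issues the remaining ra->size - (index - start) pages, and skips the fallback entirely once nothing is left. With made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long start = 1000, size = 32, async_size = 16;
	unsigned long index = 1024;  /* how far the large-folio loop got */

	if (size > index - start)
		printf("fallback reads %lu pages (async_size %lu)\n",
		       size - (index - start), async_size);  /* 8 pages */
	else
		printf("nothing left to read\n");
	return 0;
}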
+ */ +static inline unsigned int +shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end) +{ + unsigned int order; + size_t size; + + if (!mapping_large_folio_support(mapping) || !write_end) + return 0; + + /* Calculate the write size based on the write_end */ + size = write_end - (index << PAGE_SHIFT); + order = filemap_get_order(size); + if (!order) + return 0; + + /* If we're not aligned, allocate a smaller folio */ + if (index & ((1UL << order) - 1)) + order = __ffs(index); + + order = min_t(size_t, order, MAX_PAGECACHE_ORDER); + return order > 0 ? BIT(order + 1) - 1 : 0; +} + +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + unsigned long vm_flags) { + unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? + 0 : BIT(HPAGE_PMD_ORDER); + unsigned long within_size_orders; + unsigned int order; + pgoff_t aligned_index; loff_t i_size; - if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) - return false; if (!S_ISREG(inode->i_mode)) - return false; + return 0; if (shmem_huge == SHMEM_HUGE_DENY) - return false; + return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) - return true; + return maybe_pmd_order; + /* + * The huge order allocation for anon shmem is controlled through + * the mTHP interface, so we still use PMD-sized huge order to + * check whether global control is enabled. + * + * For tmpfs mmap()'s huge order, we still use PMD-sized order to + * allocate huge pages due to lack of a write size hint. + * + * Otherwise, tmpfs will allow getting a highest order hint based on + * the size of write and fallocate paths, then will try each allowable + * huge orders. + */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: - return true; + if (vma) + return maybe_pmd_order; + + return shmem_mapping_size_orders(inode->i_mapping, index, write_end); case SHMEM_HUGE_WITHIN_SIZE: - index = round_up(index + 1, HPAGE_PMD_NR); - i_size = max(write_end, i_size_read(inode)); - i_size = round_up(i_size, PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) - return true; + if (vma) + within_size_orders = maybe_pmd_order; + else + within_size_orders = shmem_mapping_size_orders(inode->i_mapping, + index, write_end); + + order = highest_order(within_size_orders); + while (within_size_orders) { + aligned_index = round_up(index + 1, 1 << order); + i_size = max(write_end, i_size_read(inode)); + i_size = round_up(i_size, PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= aligned_index) + return within_size_orders; + + order = next_order(&within_size_orders, order); + } fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) - return true; + return maybe_pmd_order; fallthrough; default: - return false; + return 0; } } @@ -779,11 +846,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, return 0; } -static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - unsigned long vm_flags) +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + unsigned long vm_flags) { - return false; + return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -1180,7 +1248,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); - if (shmem_huge_global_enabled(inode, 0, 0, false, 0)) + if 
(shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1480,7 +1548,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (WARN_ON_ONCE(!wbc->for_reclaim)) goto redirty; - if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap)) + if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) @@ -1690,22 +1758,18 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); unsigned long vm_flags = vma ? vma->vm_flags : 0; pgoff_t aligned_index; - bool global_huge; + unsigned int global_orders; loff_t i_size; int order; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) return 0; - global_huge = shmem_huge_global_enabled(inode, index, write_end, - shmem_huge_force, vm_flags); - if (!vma || !vma_is_anon_shmem(vma)) { - /* - * For tmpfs, we now only support PMD sized THP if huge page - * is enabled, otherwise fallback to order 0. - */ - return global_huge ? BIT(HPAGE_PMD_ORDER) : 0; - } + global_orders = shmem_huge_global_enabled(inode, index, write_end, + shmem_huge_force, vma, vm_flags); + /* Tmpfs huge pages allocation */ + if (!vma || !vma_is_anon_shmem(vma)) + return global_orders; /* * Following the 'deny' semantics of the top level, force the huge @@ -1737,7 +1801,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, if (vm_flags & VM_HUGEPAGE) mask |= READ_ONCE(huge_shmem_orders_madvise); - if (global_huge) + if (global_orders > 0) mask |= READ_ONCE(huge_shmem_orders_inherit); return THP_ORDERS_ALL_FILE_DEFAULT & mask; @@ -1903,6 +1967,65 @@ unlock: return ERR_PTR(error); } +static struct folio *shmem_swap_alloc_folio(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + swp_entry_t entry, int order, gfp_t gfp) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct folio *new; + void *shadow; + int nr_pages; + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success with further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } + + new = shmem_alloc_folio(gfp, order, info, index); + if (!new) + return ERR_PTR(-ENOMEM); + + nr_pages = folio_nr_pages(new); + if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, + gfp, entry)) { + folio_put(new); + return ERR_PTR(-ENOMEM); + } + + /* + * Prevent parallel swapin from proceeding with the swap cache flag. + * + * Of course there is another possible concurrent scenario as well, + * that is to say, the swap cache flag of a large folio has already + * been set by swapcache_prepare(), while another thread may have + * already split the large swap entry stored in the shmem mapping. + * In this case, shmem_add_to_page_cache() will help identify the + * concurrent swapin and return -EEXIST. 
+ */ + if (swapcache_prepare(entry, nr_pages)) { + folio_put(new); + return ERR_PTR(-EEXIST); + } + + __folio_set_locked(new); + __folio_set_swapbacked(new); + new->swap = entry; + + mem_cgroup_swapin_uncharge_swap(entry, nr_pages); + shadow = get_shadow_from_swap_cache(entry); + if (shadow) + workingset_refault(new, shadow); + folio_add_lru(new); + swap_read_folio(new, NULL); + return new; +} + /* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of @@ -2006,7 +2129,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, - struct folio *folio, swp_entry_t swap) + struct folio *folio, swp_entry_t swap, + bool skip_swapcache) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; @@ -2022,7 +2146,8 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); - delete_from_swap_cache(folio); + if (!skip_swapcache) + delete_from_swap_cache(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) @@ -2126,8 +2251,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct shmem_inode_info *info = SHMEM_I(inode); struct swap_info_struct *si; struct folio *folio = NULL; + bool skip_swapcache = false; swp_entry_t swap; - int error, nr_pages; + int error, nr_pages, order, split_order; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); swap = radix_to_swp_entry(*foliop); @@ -2146,8 +2272,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); + order = xa_get_order(&mapping->i_pages, index); if (!folio) { - int split_order; + bool fallback_order0 = false; /* Or update major stats only when swapin succeeds?? */ if (fault_type) { @@ -2157,6 +2284,33 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + */ + if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled())) + fallback_order0 = true; + + /* Skip swapcache for synchronous device. */ + if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); + if (!IS_ERR(folio)) { + skip_swapcache = true; + goto alloced; + } + + /* + * Fallback to swapin order-0 folio unless the swap entry + * already exists. + */ + error = PTR_ERR(folio); + folio = NULL; + if (error == -EEXIST) + goto failed; + } + + /* * Now swap device can only swap in order 0 folio, then we * should split the large swap entry stored in the pagecache * if necessary. @@ -2184,13 +2338,38 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, error = -ENOMEM; goto failed; } + } else if (order != folio_order(folio)) { + /* + * Swap readahead may swap in order 0 folios into swapcache + * asynchronously, while the shmem mapping can still stores + * large swap entries. In such cases, we should split the + * large swap entry to prevent possible data corruption. 
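
A quick sanity check of the offset fix-up performed after such a split, using purely illustrative numbers:

/*
 * Example: the mapping held one order-4 swap entry covering indices
 * 16..31 and the fault is at index 21.  Once shmem_split_large_entry()
 * returns split_order == 4:
 *
 *	offset = 21 - round_down(21, 1 << 4) = 5
 *	swap   = swp_entry(swp_type(swap), swp_offset(swap) + 5)
 *
 * i.e. the order-0 entry five slots into the original cluster, which is
 * exactly the page this fault needs.
 */
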
+ */ + split_order = shmem_split_large_entry(inode, index, swap, gfp); + if (split_order < 0) { + error = split_order; + goto failed; + } + + /* + * If the large swap entry has already been split, it is + * necessary to recalculate the new swap entry based on + * the old order alignment. + */ + if (split_order > 0) { + pgoff_t offset = index - round_down(index, 1 << split_order); + + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } } +alloced: /* We have to do this with folio locked to prevent races */ folio_lock(folio); - if (!folio_test_swapcache(folio) || + if ((!skip_swapcache && !folio_test_swapcache(folio)) || folio->swap.val != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { + !shmem_confirm_swap(mapping, index, swap) || + xa_get_order(&mapping->i_pages, index) != folio_order(folio)) { error = -EEXIST; goto unlock; } @@ -2224,7 +2403,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (sgp == SGP_WRITE) folio_mark_accessed(folio); - delete_from_swap_cache(folio); + if (skip_swapcache) { + folio->swap.val = 0; + swapcache_clear(si, swap, nr_pages); + } else { + delete_from_swap_cache(folio); + } folio_mark_dirty(folio); swap_free_nr(swap, nr_pages); put_swap_device(si); @@ -2235,8 +2419,11 @@ failed: if (!shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; if (error == -EIO) - shmem_set_folio_swapin_error(inode, index, folio, swap); + shmem_set_folio_swapin_error(inode, index, folio, swap, + skip_swapcache); unlock: + if (skip_swapcache) + swapcache_clear(si, swap, folio_nr_pages(folio)); if (folio) { folio_unlock(folio); folio_put(folio); @@ -2752,12 +2939,6 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); - int ret; - - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ @@ -3329,7 +3510,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, size = min_t(size_t, size, PAGE_SIZE - offset); - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + if (!pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); *buf = (struct pipe_buffer) { @@ -3356,7 +3537,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, int error = 0; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -3443,7 +3624,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) break; cond_resched(); @@ -3731,16 +3912,16 @@ out_iput: return error; } -static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) - return error; + return ERR_PTR(error); inc_nlink(dir); - return 0; + return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, @@ -4585,48 +4766,37 @@ bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } -static int 
shmem_parse_options(struct fs_context *fc, void *data) +static char *shmem_next_opt(char **s) { - char *options = data; + char *sbegin = *s; + char *p; - if (options) { - int err = security_sb_eat_lsm_opts(options, &fc->security); - if (err) - return err; - } + if (sbegin == NULL) + return NULL; - while (options != NULL) { - char *this_char = options; - for (;;) { - /* - * NUL-terminate this option: unfortunately, - * mount options form a comma-separated list, - * but mpol's nodelist may also contain commas. - */ - options = strchr(options, ','); - if (options == NULL) - break; - options++; - if (!isdigit(*options)) { - options[-1] = '\0'; - break; - } - } - if (*this_char) { - char *value = strchr(this_char, '='); - size_t len = 0; - int err; - - if (value) { - *value++ = '\0'; - len = strlen(value); - } - err = vfs_parse_fs_string(fc, this_char, value, len); - if (err < 0) - return err; + /* + * NUL-terminate this option: unfortunately, + * mount options form a comma-separated list, + * but mpol's nodelist may also contain commas. + */ + for (;;) { + p = strchr(*s, ','); + if (p == NULL) + break; + *s = p + 1; + if (!isdigit(*(p+1))) { + *p = '\0'; + return sbegin; } } - return 0; + + *s = NULL; + return sbegin; +} + +static int shmem_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* @@ -4893,7 +5063,12 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; - sbinfo->huge = ctx->huge; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (ctx->seen & SHMEM_SEEN_HUGE) + sbinfo->huge = ctx->huge; + else + sbinfo->huge = tmpfs_huge; +#endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; @@ -4971,7 +5146,7 @@ static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS - .parse_monolithic = shmem_parse_options, + .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif @@ -5444,6 +5619,21 @@ static int __init setup_transparent_hugepage_shmem(char *str) } __setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); +static int __init setup_transparent_hugepage_tmpfs(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge < 0) { + pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); + return huge; + } + + tmpfs_huge = huge; + return 1; +} +__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); + static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_shmem(char *str) { diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 4a85b94d12ce..794bd433cce0 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -195,8 +195,6 @@ int shrinker_debugfs_add(struct shrinker *shrinker) int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) { - struct dentry *entry; - char buf[128]; const char *new, *old; va_list ap; int ret = 0; @@ -213,18 +211,8 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) 
old = shrinker->name; shrinker->name = new; - if (shrinker->debugfs_entry) { - snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, - shrinker->debugfs_id); - - entry = debugfs_rename(shrinker_debugfs_root, - shrinker->debugfs_entry, - shrinker_debugfs_root, buf); - if (IS_ERR(entry)) - ret = PTR_ERR(entry); - else - shrinker->debugfs_entry = entry; - } + ret = debugfs_change_name(shrinker->debugfs_entry, "%s-%d", + shrinker->name, shrinker->debugfs_id); mutex_unlock(&shrinker_mutex); diff --git a/mm/slab.h b/mm/slab.h index 632fedd71fea..05a21dc796e0 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -128,7 +128,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t) /** * slab_folio - The folio allocated for a slab - * @slab: The slab. + * @s: The slab. * * Slabs are allocated as folios that contain the individual objects and are * using some fields in the first struct page of the folio - those fields are @@ -159,7 +159,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t) /** * slab_page - The first struct page allocated for a slab - * @slab: The slab. + * @s: The slab. * * A convenience wrapper for converting slab to the first struct page of the * underlying folio, to communicate with code not yet converted to folio or @@ -457,39 +457,17 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s) return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); } -/* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ - SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \ + SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT | \ + SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) -#ifdef CONFIG_SLUB_DEBUG #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) -#else -#define SLAB_DEBUG_FLAGS (0) -#endif -#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_ACCOUNT | \ - SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) - -/* Common flags available with current configuration */ -#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) - -/* Common flags permitted for kmem_cache_create */ -#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ - SLAB_RED_ZONE | \ - SLAB_POISON | \ - SLAB_STORE_USER | \ - SLAB_TRACE | \ - SLAB_CONSISTENCY_CHECKS | \ - SLAB_NOLEAKTRACE | \ - SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | \ - SLAB_ACCOUNT | \ - SLAB_KMALLOC | \ - SLAB_NO_MERGE | \ - SLAB_NO_USER_FLAGS) +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS) bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); @@ -604,6 +582,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts); #endif +void kvfree_rcu_cb(struct rcu_head *head); + size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) diff --git a/mm/slab_common.c b/mm/slab_common.c index a29457bef626..5be257e03c7c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -28,7 +28,9 @@ #include <asm/page.h> #include <linux/memcontrol.h> #include <linux/stackdepot.h> +#include <trace/events/rcu.h> +#include "../kernel/rcu/rcu.h" #include "internal.h" #include "slab.h" @@ -296,6 +298,8 @@ struct kmem_cache 
*__kmem_cache_create_args(const char *name, static_branch_enable(&slub_debug_enabled); if (flags & SLAB_STORE_USER) stack_depot_init(); +#else + flags &= ~SLAB_DEBUG_FLAGS; #endif mutex_lock(&slab_mutex); @@ -305,20 +309,11 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, goto out_unlock; } - /* Refuse requests with allocator specific flags */ if (flags & ~SLAB_FLAGS_PERMITTED) { err = -EINVAL; goto out_unlock; } - /* - * Some allocators will constraint the set of valid flags to a subset - * of all flags. We expect them to define CACHE_CREATE_MASK in this - * case, and we'll just provide them with a sanitized version of the - * passed flags. - */ - flags &= CACHE_CREATE_MASK; - /* Fail closed on bad usersize of useroffset values. */ if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || WARN_ON(!args->usersize && args->useroffset) || @@ -1282,3 +1277,908 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); +#ifndef CONFIG_KVFREE_RCU_BATCHED + +void kvfree_call_rcu(struct rcu_head *head, void *ptr) +{ + if (head) { + kasan_record_aux_stack(ptr); + call_rcu(head, kvfree_rcu_cb); + return; + } + + // kvfree_rcu(one_arg) call. + might_sleep(); + synchronize_rcu(); + kvfree(ptr); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +void __init kvfree_rcu_init(void) +{ +} + +#else /* CONFIG_KVFREE_RCU_BATCHED */ + +/* + * This rcu parameter is runtime-read-only. It reflects + * a minimum allowed number of objects which can be cached + * per-CPU. Object size is equal to one page. This value + * can be changed at boot time. + */ +static int rcu_min_cached_objs = 5; +module_param(rcu_min_cached_objs, int, 0444); + +// A page shrinker can ask for pages to be freed to make them +// available for other parts of the system. This usually happens +// under low memory conditions, and in that case we should also +// defer page-cache filling for a short time period. +// +// The default value is 5 seconds, which is long enough to reduce +// interference with the shrinker while it asks other systems to +// drain their caches. +static int rcu_delay_page_cache_fill_msec = 5000; +module_param(rcu_delay_page_cache_fill_msec, int, 0444); + +static struct workqueue_struct *rcu_reclaim_wq; + +/* Maximum number of jiffies to wait before draining a batch. */ +#define KFREE_DRAIN_JIFFIES (5 * HZ) +#define KFREE_N_BATCHES 2 +#define FREE_N_CHANNELS 2 + +/** + * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers + * @list: List node. All blocks are linked between each other + * @gp_snap: Snapshot of RCU state for objects placed to this bulk + * @nr_records: Number of active pointers in the array + * @records: Array of the kvfree_rcu() pointers + */ +struct kvfree_rcu_bulk_data { + struct list_head list; + struct rcu_gp_oldstate gp_snap; + unsigned long nr_records; + void *records[] __counted_by(nr_records); +}; + +/* + * This macro defines how many entries the "records" array + * will contain. It is based on the fact that the size of + * kvfree_rcu_bulk_data structure becomes exactly one page. + */ +#define KVFREE_BULK_MAX_ENTR \ + ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) + +/** + * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests + * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head_free: List of kfree_rcu() objects waiting for a grace period + * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. 
+ * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period + * @krcp: Pointer to @kfree_rcu_cpu structure + */ + +struct kfree_rcu_cpu_work { + struct rcu_work rcu_work; + struct rcu_head *head_free; + struct rcu_gp_oldstate head_free_gp_snap; + struct list_head bulk_head_free[FREE_N_CHANNELS]; + struct kfree_rcu_cpu *krcp; +}; + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * @head: List of kfree_rcu() objects not yet waiting for a grace period + * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" + * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period + * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period + * @lock: Synchronize access to this structure + * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES + * @initialized: The @rcu_work fields have been initialized + * @head_count: Number of objects in rcu_head singular list + * @bulk_count: Number of objects in bulk-list + * @bkvcache: + * A simple cache list that contains objects for reuse purpose. + * In order to save some per-cpu space the list is singular. + * Even though it is lockless an access has to be protected by the + * per-cpu lock. + * @page_cache_work: A work to refill the cache when it is empty + * @backoff_page_cache_fill: Delay cache refills + * @work_in_progress: Indicates that page_cache_work is running + * @hrtimer: A hrtimer for scheduling a page_cache_work + * @nr_bkv_objs: number of allocated objects at @bkvcache. + * + * This is a per-CPU structure. The reason that it is not included in + * the rcu_data structure is to permit this code to be extracted from + * the RCU files. Such extraction could allow further optimization of + * the interactions with the slab allocators. + */ +struct kfree_rcu_cpu { + // Objects queued on a linked list + // through their rcu_head structures. + struct rcu_head *head; + unsigned long head_gp_snap; + atomic_t head_count; + + // Objects queued on a bulk-list. + struct list_head bulk_head[FREE_N_CHANNELS]; + atomic_t bulk_count[FREE_N_CHANNELS]; + + struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; + raw_spinlock_t lock; + struct delayed_work monitor_work; + bool initialized; + + struct delayed_work page_cache_work; + atomic_t backoff_page_cache_fill; + atomic_t work_in_progress; + struct hrtimer hrtimer; + + struct llist_head bkvcache; + int nr_bkv_objs; +}; + +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), +}; + +static __always_inline void +debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + int i; + + for (i = 0; i < bhead->nr_records; i++) + debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); +#endif +} + +static inline struct kfree_rcu_cpu * +krc_this_cpu_lock(unsigned long *flags) +{ + struct kfree_rcu_cpu *krcp; + + local_irq_save(*flags); // For safely calling this_cpu_ptr(). 
+ krcp = this_cpu_ptr(&krc); + raw_spin_lock(&krcp->lock); + + return krcp; +} + +static inline void +krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) +{ + raw_spin_unlock_irqrestore(&krcp->lock, flags); +} + +static inline struct kvfree_rcu_bulk_data * +get_cached_bnode(struct kfree_rcu_cpu *krcp) +{ + if (!krcp->nr_bkv_objs) + return NULL; + + WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1); + return (struct kvfree_rcu_bulk_data *) + llist_del_first(&krcp->bkvcache); +} + +static inline bool +put_cached_bnode(struct kfree_rcu_cpu *krcp, + struct kvfree_rcu_bulk_data *bnode) +{ + // Check the limit. + if (krcp->nr_bkv_objs >= rcu_min_cached_objs) + return false; + + llist_add((struct llist_node *) bnode, &krcp->bkvcache); + WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1); + return true; +} + +static int +drain_page_cache(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + struct llist_node *page_list, *pos, *n; + int freed = 0; + + if (!rcu_min_cached_objs) + return 0; + + raw_spin_lock_irqsave(&krcp->lock, flags); + page_list = llist_del_all(&krcp->bkvcache); + WRITE_ONCE(krcp->nr_bkv_objs, 0); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + llist_for_each_safe(pos, n, page_list) { + free_page((unsigned long)pos); + freed++; + } + + return freed; +} + +static void +kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, + struct kvfree_rcu_bulk_data *bnode, int idx) +{ + unsigned long flags; + int i; + + if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { + debug_rcu_bhead_unqueue(bnode); + rcu_lock_acquire(&rcu_callback_map); + if (idx == 0) { // kmalloc() / kfree(). + trace_rcu_invoke_kfree_bulk_callback( + "slab", bnode->nr_records, + bnode->records); + + kfree_bulk(bnode->nr_records, bnode->records); + } else { // vmalloc() / vfree(). + for (i = 0; i < bnode->nr_records; i++) { + trace_rcu_invoke_kvfree_callback( + "slab", bnode->records[i], 0); + + vfree(bnode->records[i]); + } + } + rcu_lock_release(&rcu_callback_map); + } + + raw_spin_lock_irqsave(&krcp->lock, flags); + if (put_cached_bnode(krcp, bnode)) + bnode = NULL; + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (bnode) + free_page((unsigned long) bnode); + + cond_resched_tasks_rcu_qs(); +} + +static void +kvfree_rcu_list(struct rcu_head *head) +{ + struct rcu_head *next; + + for (; head; head = next) { + void *ptr = (void *) head->func; + unsigned long offset = (void *) head - ptr; + + next = head->next; + debug_rcu_head_unqueue((struct rcu_head *)ptr); + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kvfree_callback("slab", head, offset); + + kvfree(ptr); + + rcu_lock_release(&rcu_callback_map); + cond_resched_tasks_rcu_qs(); + } +} + +/* + * This function is invoked in workqueue context after a grace period. + * It frees all the objects queued on ->bulk_head_free or ->head_free. + */ +static void kfree_rcu_work(struct work_struct *work) +{ + unsigned long flags; + struct kvfree_rcu_bulk_data *bnode, *n; + struct list_head bulk_head[FREE_N_CHANNELS]; + struct rcu_head *head; + struct kfree_rcu_cpu *krcp; + struct kfree_rcu_cpu_work *krwp; + struct rcu_gp_oldstate head_gp_snap; + int i; + + krwp = container_of(to_rcu_work(work), + struct kfree_rcu_cpu_work, rcu_work); + krcp = krwp->krcp; + + raw_spin_lock_irqsave(&krcp->lock, flags); + // Channels 1 and 2. + for (i = 0; i < FREE_N_CHANNELS; i++) + list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]); + + // Channel 3. 
+ head = krwp->head_free; + krwp->head_free = NULL; + head_gp_snap = krwp->head_free_gp_snap; + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + // Handle the first two channels. + for (i = 0; i < FREE_N_CHANNELS; i++) { + // Start from the tail page, so a GP is likely passed for it. + list_for_each_entry_safe(bnode, n, &bulk_head[i], list) + kvfree_rcu_bulk(krcp, bnode, i); + } + + /* + * This is used when the "bulk" path can not be used for the + * double-argument of kvfree_rcu(). This happens when the + * page-cache is empty, which means that objects are instead + * queued on a linked list through their rcu_head structures. + * This list is named "Channel 3". + */ + if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap))) + kvfree_rcu_list(head); +} + +static bool +need_offload_krc(struct kfree_rcu_cpu *krcp) +{ + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + if (!list_empty(&krcp->bulk_head[i])) + return true; + + return !!READ_ONCE(krcp->head); +} + +static bool +need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp) +{ + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + if (!list_empty(&krwp->bulk_head_free[i])) + return true; + + return !!krwp->head_free; +} + +static int krc_count(struct kfree_rcu_cpu *krcp) +{ + int sum = atomic_read(&krcp->head_count); + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + sum += atomic_read(&krcp->bulk_count[i]); + + return sum; +} + +static void +__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + long delay, delay_left; + + delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; + if (delayed_work_pending(&krcp->monitor_work)) { + delay_left = krcp->monitor_work.timer.expires - jiffies; + if (delay < delay_left) + mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay); + return; + } + queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay); +} + +static void +schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&krcp->lock, flags); + __schedule_delayed_monitor_work(krcp); + raw_spin_unlock_irqrestore(&krcp->lock, flags); +} + +static void +kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) +{ + struct list_head bulk_ready[FREE_N_CHANNELS]; + struct kvfree_rcu_bulk_data *bnode, *n; + struct rcu_head *head_ready = NULL; + unsigned long flags; + int i; + + raw_spin_lock_irqsave(&krcp->lock, flags); + for (i = 0; i < FREE_N_CHANNELS; i++) { + INIT_LIST_HEAD(&bulk_ready[i]); + + list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { + if (!poll_state_synchronize_rcu_full(&bnode->gp_snap)) + break; + + atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); + list_move(&bnode->list, &bulk_ready[i]); + } + } + + if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) { + head_ready = krcp->head; + atomic_set(&krcp->head_count, 0); + WRITE_ONCE(krcp->head, NULL); + } + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + for (i = 0; i < FREE_N_CHANNELS; i++) { + list_for_each_entry_safe(bnode, n, &bulk_ready[i], list) + kvfree_rcu_bulk(krcp, bnode, i); + } + + if (head_ready) + kvfree_rcu_list(head_ready); +} + +/* + * Return: %true if a work is queued, %false otherwise. + */ +static bool +kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) +{ + unsigned long flags; + bool queued = false; + int i, j; + + raw_spin_lock_irqsave(&krcp->lock, flags); + + // Attempt to start a new batch. 
+ for (i = 0; i < KFREE_N_BATCHES; i++) { + struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); + + // Try to detach bulk_head or head and attach it, only when + // all channels are free. Any channel is not free means at krwp + // there is on-going rcu work to handle krwp's free business. + if (need_wait_for_krwp_work(krwp)) + continue; + + // kvfree_rcu_drain_ready() might handle this krcp, if so give up. + if (need_offload_krc(krcp)) { + // Channel 1 corresponds to the SLAB-pointer bulk path. + // Channel 2 corresponds to vmalloc-pointer bulk path. + for (j = 0; j < FREE_N_CHANNELS; j++) { + if (list_empty(&krwp->bulk_head_free[j])) { + atomic_set(&krcp->bulk_count[j], 0); + list_replace_init(&krcp->bulk_head[j], + &krwp->bulk_head_free[j]); + } + } + + // Channel 3 corresponds to both SLAB and vmalloc + // objects queued on the linked list. + if (!krwp->head_free) { + krwp->head_free = krcp->head; + get_state_synchronize_rcu_full(&krwp->head_free_gp_snap); + atomic_set(&krcp->head_count, 0); + WRITE_ONCE(krcp->head, NULL); + } + + // One work is per one batch, so there are three + // "free channels", the batch can handle. Break + // the loop since it is done with this CPU thus + // queuing an RCU work is _always_ success here. + queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work); + WARN_ON_ONCE(!queued); + break; + } + } + + raw_spin_unlock_irqrestore(&krcp->lock, flags); + return queued; +} + +/* + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + */ +static void kfree_rcu_monitor(struct work_struct *work) +{ + struct kfree_rcu_cpu *krcp = container_of(work, + struct kfree_rcu_cpu, monitor_work.work); + + // Drain ready for reclaim. + kvfree_rcu_drain_ready(krcp); + + // Queue a batch for a rest. + kvfree_rcu_queue_batch(krcp); + + // If there is nothing to detach, it means that our job is + // successfully done here. In case of having at least one + // of the channels that is still busy we should rearm the + // work to repeat an attempt. Because previous batches are + // still in progress. + if (need_offload_krc(krcp)) + schedule_delayed_monitor_work(krcp); +} + +static void fill_page_cache_func(struct work_struct *work) +{ + struct kvfree_rcu_bulk_data *bnode; + struct kfree_rcu_cpu *krcp = + container_of(work, struct kfree_rcu_cpu, + page_cache_work.work); + unsigned long flags; + int nr_pages; + bool pushed; + int i; + + nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? + 1 : rcu_min_cached_objs; + + for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) { + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + + if (!bnode) + break; + + raw_spin_lock_irqsave(&krcp->lock, flags); + pushed = put_cached_bnode(krcp, bnode); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (!pushed) { + free_page((unsigned long) bnode); + break; + } + } + + atomic_set(&krcp->work_in_progress, 0); + atomic_set(&krcp->backoff_page_cache_fill, 0); +} + +// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() +// state specified by flags. If can_alloc is true, the caller must +// be schedulable and not be holding any locks or mutexes that might be +// acquired by the memory allocator or anything that it might invoke. +// Returns true if ptr was successfully recorded, else the caller must +// use a fallback. 
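
For scale, each bulk block is sized so that struct kvfree_rcu_bulk_data fills exactly one page (see KVFREE_BULK_MAX_ENTR above); a back-of-the-envelope estimate for a typical 64-bit build with 4 KiB pages (the field sizes here are assumptions, the authoritative value is the macro itself):

/*
 * header: list_head (16) + rcu_gp_oldstate (16) + nr_records (8) = 40 bytes
 * (4096 - 40) / sizeof(void *) = 507 record slots per block
 *
 * so one cached page lets a single grace period cover roughly 500
 * kvfree_rcu() requests queued on that CPU.
 */
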
+static inline bool +add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, + unsigned long *flags, void *ptr, bool can_alloc) +{ + struct kvfree_rcu_bulk_data *bnode; + int idx; + + *krcp = krc_this_cpu_lock(flags); + if (unlikely(!(*krcp)->initialized)) + return false; + + idx = !!is_vmalloc_addr(ptr); + bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx], + struct kvfree_rcu_bulk_data, list); + + /* Check if a new block is required. */ + if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) { + bnode = get_cached_bnode(*krcp); + if (!bnode && can_alloc) { + krc_this_cpu_unlock(*krcp, *flags); + + // __GFP_NORETRY - allows a light-weight direct reclaim + // what is OK from minimizing of fallback hitting point of + // view. Apart of that it forbids any OOM invoking what is + // also beneficial since we are about to release memory soon. + // + // __GFP_NOMEMALLOC - prevents from consuming of all the + // memory reserves. Please note we have a fallback path. + // + // __GFP_NOWARN - it is supposed that an allocation can + // be failed under low memory or high memory pressure + // scenarios. + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + raw_spin_lock_irqsave(&(*krcp)->lock, *flags); + } + + if (!bnode) + return false; + + // Initialize the new block and attach it. + bnode->nr_records = 0; + list_add(&bnode->list, &(*krcp)->bulk_head[idx]); + } + + // Finally insert and update the GP for this page. + bnode->nr_records++; + bnode->records[bnode->nr_records - 1] = ptr; + get_state_synchronize_rcu_full(&bnode->gp_snap); + atomic_inc(&(*krcp)->bulk_count[idx]); + + return true; +} + +static enum hrtimer_restart +schedule_page_work_fn(struct hrtimer *t) +{ + struct kfree_rcu_cpu *krcp = + container_of(t, struct kfree_rcu_cpu, hrtimer); + + queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); + return HRTIMER_NORESTART; +} + +static void +run_page_cache_worker(struct kfree_rcu_cpu *krcp) +{ + // If cache disabled, bail out. + if (!rcu_min_cached_objs) + return; + + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !atomic_xchg(&krcp->work_in_progress, 1)) { + if (atomic_read(&krcp->backoff_page_cache_fill)) { + queue_delayed_work(rcu_reclaim_wq, + &krcp->page_cache_work, + msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); + } else { + hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); + } + } +} + +void __init kfree_rcu_scheduler_running(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + if (need_offload_krc(krcp)) + schedule_delayed_monitor_work(krcp); + } +} + +/* + * Queue a request for lazy invocation of the appropriate free routine + * after a grace period. Please note that three paths are maintained, + * two for the common case using arrays of pointers and a third one that + * is used only when the main paths cannot be used, for example, due to + * memory pressure. + * + * Each kvfree_call_rcu() request is added to a batch. The batch will be drained + * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will + * be free'd in workqueue context. This allows us to: batch requests together to + * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. 
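
A minimal caller-side sketch of the interface this function backs (struct foo and the helpers are illustrative; kvfree_rcu() and kvfree_rcu_mightsleep() are the wrapper macros that expand to kvfree_call_rcu() with and without an rcu_head):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int value;
	struct rcu_head rcu;		/* used only for the deferred free */
};

static void foo_free(struct foo *p)
{
	/* two-argument form: never blocks, rides the batched path below */
	kvfree_rcu(p, rcu);
}

static void blob_free(void *blob)
{
	/* head-less form: may fall back to synchronize_rcu(), so sleepable context only */
	kvfree_rcu_mightsleep(blob);
}
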
+ */ +void kvfree_call_rcu(struct rcu_head *head, void *ptr) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp; + bool success; + + /* + * Please note there is a limitation for the head-less + * variant, that is why there is a clear rule for such + * objects: it can be used from might_sleep() context + * only. For other places please embed an rcu_head to + * your data. + */ + if (!head) + might_sleep(); + + // Queue the object but don't yet schedule the batch. + if (debug_rcu_head_queue(ptr)) { + // Probable double kfree_rcu(), just leak. + WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", + __func__, head); + + // Mark as success and leave. + return; + } + + kasan_record_aux_stack(ptr); + success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); + if (!success) { + run_page_cache_worker(krcp); + + if (head == NULL) + // Inline if kvfree_rcu(one_arg) call. + goto unlock_return; + + head->func = ptr; + head->next = krcp->head; + WRITE_ONCE(krcp->head, head); + atomic_inc(&krcp->head_count); + + // Take a snapshot for this krcp. + krcp->head_gp_snap = get_state_synchronize_rcu(); + success = true; + } + + /* + * The kvfree_rcu() caller considers the pointer freed at this point + * and likely removes any references to it. Since the actual slab + * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore + * this object (no scanning or false positives reporting). + */ + kmemleak_ignore(ptr); + + // Set timer to drain after KFREE_DRAIN_JIFFIES. + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) + __schedule_delayed_monitor_work(krcp); + +unlock_return: + krc_this_cpu_unlock(krcp, flags); + + /* + * Inline kvfree() after synchronize_rcu(). We can do + * it from might_sleep() context only, so the current + * CPU can pass the QS state. + */ + if (!success) { + debug_rcu_head_unqueue((struct rcu_head *) ptr); + synchronize_rcu(); + kvfree(ptr); + } +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +/** + * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. + * + * Note that a single argument of kvfree_rcu() call has a slow path that + * triggers synchronize_rcu() following by freeing a pointer. It is done + * before the return from the function. Therefore for any single-argument + * call that will result in a kfree() to a cache that is to be destroyed + * during module exit, it is developer's responsibility to ensure that all + * such calls have returned before the call to kmem_cache_destroy(). + */ +void kvfree_rcu_barrier(void) +{ + struct kfree_rcu_cpu_work *krwp; + struct kfree_rcu_cpu *krcp; + bool queued; + int i, cpu; + + /* + * Firstly we detach objects and queue them over an RCU-batch + * for all CPUs. Finally queued works are flushed for each CPU. + * + * Please note. If there are outstanding batches for a particular + * CPU, those have to be finished first following by queuing a new. + */ + for_each_possible_cpu(cpu) { + krcp = per_cpu_ptr(&krc, cpu); + + /* + * Check if this CPU has any objects which have been queued for a + * new GP completion. If not(means nothing to detach), we are done + * with it. If any batch is pending/running for this "krcp", below + * per-cpu flush_rcu_work() waits its completion(see last step). + */ + if (!need_offload_krc(krcp)) + continue; + + while (1) { + /* + * If we are not able to queue a new RCU work it means: + * - batches for this CPU are still in flight which should + * be flushed first and then repeat; + * - no objects to detach, because of concurrency. 
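
Callers typically pair this barrier with cache destruction at module exit, as the note above describes; foo_cachep and foo_exit() below are illustrative only:

#include <linux/module.h>
#include <linux/slab.h>

static struct kmem_cache *foo_cachep;

static void __exit foo_exit(void)
{
	/* flush every batched kvfree_rcu() request before the cache goes away */
	kvfree_rcu_barrier();
	kmem_cache_destroy(foo_cachep);
}
module_exit(foo_exit);
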
+ */ + queued = kvfree_rcu_queue_batch(krcp); + + /* + * Bail out, if there is no need to offload this "krcp" + * anymore. As noted earlier it can run concurrently. + */ + if (queued || !need_offload_krc(krcp)) + break; + + /* There are ongoing batches. */ + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); + flush_rcu_work(&krwp->rcu_work); + } + } + } + + /* + * Now we guarantee that all objects are flushed. + */ + for_each_possible_cpu(cpu) { + krcp = per_cpu_ptr(&krc, cpu); + + /* + * A monitor work can drain ready to reclaim objects + * directly. Wait its completion if running or pending. + */ + cancel_delayed_work_sync(&krcp->monitor_work); + + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); + flush_rcu_work(&krwp->rcu_work); + } + } +} +EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); + +static unsigned long +kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long count = 0; + + /* Snapshot count of all CPUs */ + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count += krc_count(krcp); + count += READ_ONCE(krcp->nr_bkv_objs); + atomic_set(&krcp->backoff_page_cache_fill, 1); + } + + return count == 0 ? SHRINK_EMPTY : count; +} + +static unsigned long +kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu, freed = 0; + + for_each_possible_cpu(cpu) { + int count; + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count = krc_count(krcp); + count += drain_page_cache(krcp); + kfree_rcu_monitor(&krcp->monitor_work.work); + + sc->nr_to_scan -= count; + freed += count; + + if (sc->nr_to_scan <= 0) + break; + } + + return freed == 0 ? SHRINK_STOP : freed; +} + +void __init kvfree_rcu_init(void) +{ + int cpu; + int i, j; + struct shrinker *kfree_rcu_shrinker; + + rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_reclaim_wq); + + /* Clamp it to [0:100] seconds interval. 
*/ + if (rcu_delay_page_cache_fill_msec < 0 || + rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { + + rcu_delay_page_cache_fill_msec = + clamp(rcu_delay_page_cache_fill_msec, 0, + (int) (100 * MSEC_PER_SEC)); + + pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n", + rcu_delay_page_cache_fill_msec); + } + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + for (i = 0; i < KFREE_N_BATCHES; i++) { + INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); + krcp->krw_arr[i].krcp = krcp; + + for (j = 0; j < FREE_N_CHANNELS; j++) + INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]); + } + + for (i = 0; i < FREE_N_CHANNELS; i++) + INIT_LIST_HEAD(&krcp->bulk_head[i]); + + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); + INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); + krcp->initialized = true; + } + + kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu"); + if (!kfree_rcu_shrinker) { + pr_err("Failed to allocate kfree_rcu() shrinker!\n"); + return; + } + + kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count; + kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan; + + shrinker_register(kfree_rcu_shrinker); +} + +#endif /* CONFIG_KVFREE_RCU_BATCHED */ + diff --git a/mm/slub.c b/mm/slub.c index c2151c9fee22..5eac408e818e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -19,6 +19,7 @@ #include <linux/bitops.h> #include <linux/slab.h> #include "slab.h" +#include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kasan.h> @@ -1017,22 +1018,31 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object) set_orig_size(s, (void *)object, s->object_size); } -static void slab_bug(struct kmem_cache *s, char *fmt, ...) +static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp) { struct va_format vaf; va_list args; - va_start(args, fmt); + va_copy(args, argsp); vaf.fmt = fmt; vaf.va = &args; pr_err("=============================================================================\n"); - pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf); pr_err("-----------------------------------------------------------------------------\n\n"); va_end(args); } +static void slab_bug(struct kmem_cache *s, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + __slab_bug(s, fmt, args); + va_end(args); +} + __printf(2, 3) -static void slab_fix(struct kmem_cache *s, char *fmt, ...) +static void slab_fix(struct kmem_cache *s, const char *fmt, ...) 
{ struct va_format vaf; va_list args; @@ -1085,19 +1095,19 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) /* Beginning of the filler is the free pointer */ print_section(KERN_ERR, "Padding ", p + off, size_from_object(s) - off); - - dump_stack(); } static void object_err(struct kmem_cache *s, struct slab *slab, - u8 *object, char *reason) + u8 *object, const char *reason) { if (slab_add_kunit_errors()) return; - slab_bug(s, "%s", reason); + slab_bug(s, reason); print_trailer(s, slab, object); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); } static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, @@ -1114,22 +1124,30 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, return false; } +static void __slab_err(struct slab *slab) +{ + if (slab_in_kunit_test()) + return; + + print_slab_info(slab); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); +} + static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, const char *fmt, ...) { va_list args; - char buf[100]; if (slab_add_kunit_errors()) return; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + __slab_bug(s, fmt, args); va_end(args); - slab_bug(s, "%s", buf); - print_slab_info(slab); - dump_stack(); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + __slab_err(slab); } static void init_object(struct kmem_cache *s, void *object, u8 val) @@ -1166,7 +1184,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) s->inuse - poison_size); } -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, +static void restore_bytes(struct kmem_cache *s, const char *message, u8 data, void *from, void *to) { slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data); @@ -1181,8 +1199,8 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data, static pad_check_attributes int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, - u8 *object, char *what, - u8 *start, unsigned int value, unsigned int bytes) + u8 *object, const char *what, u8 *start, unsigned int value, + unsigned int bytes, bool slab_obj_print) { u8 *fault; u8 *end; @@ -1201,10 +1219,11 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab, if (slab_add_kunit_errors()) goto skip_bug_print; - slab_bug(s, "%s overwritten", what); - pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", - fault, end - 1, fault - addr, - fault[0], value); + pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + what, fault, end - 1, fault - addr, fault[0], value); + + if (slab_obj_print) + object_err(s, slab, object, "Object corrupt"); skip_bug_print: restore_bytes(s, what, value, fault, end); @@ -1268,7 +1287,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) return 1; return check_bytes_and_report(s, slab, p, "Object padding", - p + off, POISON_INUSE, size_from_object(s) - off); + p + off, POISON_INUSE, size_from_object(s) - off, true); } /* Check the pad bytes at the end of a slab page */ @@ -1301,9 +1320,10 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab) while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu", - fault, end - 1, fault - start); + slab_bug(s, "Padding overwritten. 
0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); print_section(KERN_ERR, "Padding ", pad, remainder); + __slab_err(slab); restore_bytes(s, "slab padding", POISON_INUSE, fault, end); } @@ -1318,11 +1338,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", - object - s->red_left_pad, val, s->red_left_pad)) + object - s->red_left_pad, val, s->red_left_pad, ret)) ret = 0; if (!check_bytes_and_report(s, slab, object, "Right Redzone", - endobject, val, s->inuse - s->object_size)) + endobject, val, s->inuse - s->object_size, ret)) ret = 0; if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { @@ -1331,7 +1351,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->object_size > orig_size && !check_bytes_and_report(s, slab, object, "kmalloc Redzone", p + orig_size, - val, s->object_size - orig_size)) { + val, s->object_size - orig_size, ret)) { ret = 0; } } @@ -1339,7 +1359,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { if (!check_bytes_and_report(s, slab, p, "Alignment padding", endobject, POISON_INUSE, - s->inuse - s->object_size)) + s->inuse - s->object_size, ret)) ret = 0; } } @@ -1355,11 +1375,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (kasan_meta_size < s->object_size - 1 && !check_bytes_and_report(s, slab, p, "Poison", p + kasan_meta_size, POISON_FREE, - s->object_size - kasan_meta_size - 1)) + s->object_size - kasan_meta_size - 1, ret)) ret = 0; if (kasan_meta_size < s->object_size && !check_bytes_and_report(s, slab, p, "End Poison", - p + s->object_size - 1, POISON_END, 1)) + p + s->object_size - 1, POISON_END, 1, ret)) ret = 0; } /* @@ -1385,11 +1405,6 @@ static int check_object(struct kmem_cache *s, struct slab *slab, ret = 0; } - if (!ret && !slab_in_kunit_test()) { - print_trailer(s, slab, object); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - } - return ret; } @@ -1427,7 +1442,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) * Determine if a certain object in a slab is on the freelist. Must hold the * slab lock to guarantee that the chains are in a consistent state. 
*/ -static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) +static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search) { int nr = 0; void *fp; @@ -1437,26 +1452,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) fp = slab->freelist; while (fp && nr <= slab->objects) { if (fp == search) - return 1; + return true; if (!check_valid_pointer(s, slab, fp)) { if (object) { object_err(s, slab, object, "Freechain corrupt"); set_freepointer(s, object, NULL); + break; } else { slab_err(s, slab, "Freepointer corrupt"); slab->freelist = NULL; slab->inuse = slab->objects; slab_fix(s, "Freelist cleared"); - return 0; + return false; } - break; } object = fp; fp = get_freepointer(s, object); nr++; } + if (nr > slab->objects) { + slab_err(s, slab, "Freelist cycle detected"); + slab->freelist = NULL; + slab->inuse = slab->objects; + slab_fix(s, "Freelist cleared"); + return false; + } + max_objects = order_objects(slab_order(slab), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1624,12 +1647,12 @@ static inline int free_consistency_checks(struct kmem_cache *s, slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", object); } else if (!slab->slab_cache) { - pr_err("SLUB <none>: no slab for object 0x%p.\n", - object); - dump_stack(); - } else + slab_err(NULL, slab, "No slab cache for object 0x%p", + object); + } else { object_err(s, slab, object, - "page slab pointer corrupt."); + "page slab pointer corrupt."); + } return 0; } return 1; @@ -2311,7 +2334,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, * We have to do this manually because the rcu_head is * not located inside the object. */ - kasan_record_aux_stack_noalloc(x); + kasan_record_aux_stack(x); delayed_free->object = x; call_rcu(&delayed_free->head, slab_free_after_rcu_debug); @@ -2420,17 +2443,15 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, unsigned int order = oo_order(oo); if (node == NUMA_NO_NODE) - folio = (struct folio *)alloc_pages(flags, order); + folio = (struct folio *)alloc_frozen_pages(flags, order); else - folio = (struct folio *)__alloc_pages_node(node, flags, order); + folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); if (!folio) return NULL; slab = folio_slab(folio); __folio_set_slab(folio); - /* Make the flag visible before any changes to folio->mapping */ - smp_wmb(); if (folio_is_pfmemalloc(folio)) slab_set_pfmemalloc(slab); @@ -2651,12 +2672,10 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) __slab_clear_pfmemalloc(slab); folio->mapping = NULL; - /* Make the mapping reset visible before clearing the flag */ - smp_wmb(); __folio_clear_slab(folio); mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - __free_pages(&folio->page, order); + free_frozen_pages(&folio->page, order); } static void rcu_free_slab(struct rcu_head *h) @@ -4245,6 +4264,7 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) ptr = folio_address(folio); lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); + __folio_set_large_kmalloc(folio); } ptr = kasan_kmalloc_large(ptr, size, flags); @@ -4720,6 +4740,11 @@ static void free_large_kmalloc(struct folio *folio, void *object) { unsigned int order = folio_order(folio); + if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) { + dump_page(&folio->page, "Not a kmalloc allocation"); + return; + } + if (WARN_ON_ONCE(order == 0)) pr_warn_once("object pointer: 
0x%p\n", object); @@ -4729,9 +4754,55 @@ static void free_large_kmalloc(struct folio *folio, void *object) lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); + __folio_clear_large_kmalloc(folio); folio_put(folio); } +/* + * Given an rcu_head embedded within an object obtained from kvmalloc at an + * offset < 4k, free the object in question. + */ +void kvfree_rcu_cb(struct rcu_head *head) +{ + void *obj = head; + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *slab_addr; + + if (is_vmalloc_addr(obj)) { + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + vfree(obj); + return; + } + + folio = virt_to_folio(obj); + if (!folio_test_slab(folio)) { + /* + * rcu_head offset can be only less than page size so no need to + * consider folio order + */ + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + free_large_kmalloc(folio, obj); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + slab_addr = folio_address(folio); + + if (is_kfence_address(obj)) { + obj = kfence_object_start(obj); + } else { + unsigned int idx = __obj_to_index(s, slab_addr, obj); + + obj = slab_addr + s->size * idx; + obj = fixup_red_left(s, obj); + } + + slab_free(s, slab, obj, _RET_IP_); +} + /** * kfree - free previously allocated memory * @object: pointer returned by kmalloc() or kmem_cache_alloc() @@ -4882,6 +4953,168 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) } EXPORT_SYMBOL(krealloc_noprof); +static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) +{ + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - no + * OOM killer and no allocation failure warnings as we have a fallback. + */ + if (size > PAGE_SIZE) { + flags |= __GFP_NOWARN; + + if (!(flags & __GFP_RETRY_MAYFAIL)) + flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + flags &= ~__GFP_NOFAIL; + } + + return flags; +} + +/** + * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @b: which set of kmalloc buckets to allocate from. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. + * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is + * preferable to the vmalloc fallback, due to visible performance drawbacks. 
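kvfree_rcu_cb() above relies on the rcu_head sitting less than a page into the object, so the start of the allocation can be recovered either by aligning down to a page (vmalloc and large kmalloc) or by index arithmetic within the slab. A userspace sketch of the slab case, where slab_base and obj_size are assumptions standing in for the slab address and s->size:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Illustration only: recover an object's start address from a pointer
 * into its interior, as kvfree_rcu_cb() does for slab objects. */
static void *object_start(void *slab_base, size_t obj_size, void *interior)
{
    uintptr_t off = (uintptr_t)interior - (uintptr_t)slab_base;
    size_t idx = off / obj_size;                /* like __obj_to_index() */

    return (char *)slab_base + idx * obj_size;
}

int main(void)
{
    char slab[4096];
    void *inside = &slab[3 * 128 + 40];         /* 40 bytes into object 3 */

    assert(object_start(slab, 128, inside) == (void *)&slab[3 * 128]);
    printf("object starts at offset %td\n",
           (char *)object_start(slab, 128, inside) - slab);
    return 0;
}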
+ * + * Return: pointer to the allocated memory of %NULL in case of failure + */ +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +{ + void *ret; + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), + kmalloc_gfp_adjust(flags, size), + node, _RET_IP_); + if (ret || size <= PAGE_SIZE) + return ret; + + /* non-sleeping allocations are not supported by vmalloc */ + if (!gfpflags_allow_blocking(flags)) + return NULL; + + /* Don't even allow crazy sizes */ + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); + return NULL; + } + + /* + * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, + * since the callers already cannot assume anything + * about the resulting pointer, and cannot play + * protection games. + */ + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kvmalloc_node_noprof); + +/** + * kvfree() - Free memory. + * @addr: Pointer to allocated memory. + * + * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). + * It is slightly more efficient to use kfree() or vfree() if you are certain + * that you know which one to use. + * + * Context: Either preemptible task context or not-NMI interrupt. + */ +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. + */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + +/** + * kvrealloc - reallocate memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 + * and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or kvfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) +{ + void *n; + + if (is_vmalloc_addr(p)) + return vrealloc_noprof(p, size, flags); + + n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + if (!n) { + /* We failed to krealloc(), fall back to kvmalloc(). */ + n = kvmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + /* We already know that `p` is not a vmalloc address. 
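kmalloc_gfp_adjust() and __kvmalloc_node_noprof() above implement a simple policy: try the physically contiguous allocator first, but quietly and without retrying hard, because vmalloc is available as a fallback for anything larger than a page. A userspace sketch of just the flag-adjustment half of that policy; the flag values here are illustrative, not the kernel's GFP bit layout:

#include <stddef.h>
#include <stdio.h>

/* Userspace sketch of the kmalloc_gfp_adjust() policy above. */
#define PAGE_SIZE          4096u
#define GFP_NOWARN         0x1u
#define GFP_NORETRY        0x2u
#define GFP_RETRY_MAYFAIL  0x4u
#define GFP_NOFAIL         0x8u

static unsigned int adjust_flags(unsigned int flags, size_t size)
{
    if (size > PAGE_SIZE) {
        flags |= GFP_NOWARN;            /* a vmalloc fallback exists, stay quiet */
        if (!(flags & GFP_RETRY_MAYFAIL))
            flags |= GFP_NORETRY;       /* don't fight hard for contiguous memory */
        flags &= ~GFP_NOFAIL;           /* nofail is honoured by the fallback */
    }
    return flags;
}

int main(void)
{
    printf("small: %#x  large: %#x\n",
           adjust_flags(GFP_NOFAIL, 512),
           adjust_flags(GFP_NOFAIL, 2 * PAGE_SIZE));
    return 0;
}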
*/ + kasan_disable_current(); + memcpy(n, kasan_reset_tag(p), ksize(p)); + kasan_enable_current(); + + kfree(p); + } + } + + return n; +} +EXPORT_SYMBOL(kvrealloc_noprof); + struct detached_freelist { struct slab *slab; void *tail; @@ -5574,14 +5807,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) return !!oo_objects(s->oo); } -static void list_slab_objects(struct kmem_cache *s, struct slab *slab, - const char *text) +static void list_slab_objects(struct kmem_cache *s, struct slab *slab) { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); void *p; - slab_err(s, slab, text, s->name); + if (!slab_add_kunit_errors()) + slab_bug(s, "Objects remaining on __kmem_cache_shutdown()"); spin_lock(&object_map_lock); __fill_map(object_map, s, slab); @@ -5596,6 +5829,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, } } spin_unlock(&object_map_lock); + + __slab_err(slab); #endif } @@ -5616,8 +5851,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) remove_partial(n, slab); list_add(&slab->slab_list, &discard); } else { - list_slab_objects(s, slab, - "Objects remaining in %s on __kmem_cache_shutdown()"); + list_slab_objects(s, slab); } } spin_unlock_irq(&n->list_lock); @@ -7513,10 +7747,7 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) return -ENOMEM; } - if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0) - alloc = TRACK_ALLOC; - else - alloc = TRACK_FREE; + alloc = debugfs_get_aux_num(filep); if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { bitmap_free(obj_map); @@ -7572,11 +7803,11 @@ static void debugfs_slab_add(struct kmem_cache *s) slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root); - debugfs_create_file("alloc_traces", 0400, - slab_cache_dir, s, &slab_debugfs_fops); + debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s, + TRACK_ALLOC, &slab_debugfs_fops); - debugfs_create_file("free_traces", 0400, - slab_cache_dir, s, &slab_debugfs_fops); + debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s, + TRACK_FREE, &slab_debugfs_fops); } void debugfs_slab_release(struct kmem_cache *s) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index cec67c5f37d8..3287ebadd167 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -31,6 +31,8 @@ #include <asm/dma.h> #include <asm/pgalloc.h> +#include "internal.h" + /* * Allocate a block of memory to be used to back the virtual memory map * or to back the page tables that are used to create the mapping. 
@@ -42,8 +44,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return memblock_alloc_try_nid_raw(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, node); + return memmap_alloc(size, align, goal, node, false); } void * __meminit vmemmap_alloc_block(unsigned long size, int node) diff --git a/mm/sparse.c b/mm/sparse.c index 13b6624d3562..133b033d0cba 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -257,10 +257,7 @@ static void __init memblocks_present(void) size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_alloc(size, align); - if (!mem_section) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, align); + mem_section = memblock_alloc_or_panic(size, align); } #endif diff --git a/mm/swap.c b/mm/swap.c index 10decd9dffa1..fc8281ef4241 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -109,7 +109,7 @@ void __folio_put(struct folio *folio) page_cache_release(folio); folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, folio_order(folio)); + free_frozen_pages(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); @@ -379,37 +379,58 @@ static void __lru_cache_activate_folio(struct folio *folio) } #ifdef CONFIG_LRU_GEN -static void folio_inc_refs(struct folio *folio) + +static void lru_gen_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); if (folio_test_unevictable(folio)) return; + /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { - folio_set_referenced(folio); - return; - } - - if (!folio_test_workingset(folio)) { - folio_set_workingset(folio); + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); return; } - /* see the comment on MAX_NR_TIERS */ do { - new_flags = old_flags & LRU_REFS_MASK; - if (new_flags == LRU_REFS_MASK) - break; + if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) { + if (!folio_test_workingset(folio)) + folio_set_workingset(folio); + return; + } - new_flags += BIT(LRU_REFS_PGOFF); - new_flags |= old_flags & ~LRU_REFS_MASK; + new_flags = old_flags + BIT(LRU_REFS_PGOFF); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } -#else -static void folio_inc_refs(struct folio *folio) + +static bool lru_gen_clear_refs(struct folio *folio) +{ + struct lru_gen_folio *lrugen; + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + + if (gen < 0) + return true; + + set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); + + lrugen = &folio_lruvec(folio)->lrugen; + /* whether can do without shuffling under the LRU lock */ + return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type])); +} + +#else /* !CONFIG_LRU_GEN */ + +static void lru_gen_inc_refs(struct folio *folio) +{ +} + +static bool lru_gen_clear_refs(struct folio *folio) { + return false; } + #endif /* CONFIG_LRU_GEN */ /** @@ -427,8 +448,10 @@ static void folio_inc_refs(struct folio *folio) */ void folio_mark_accessed(struct folio *folio) { + if (folio_test_dropbehind(folio)) + return; if (lru_gen_enabled()) { - folio_inc_refs(folio); + lru_gen_inc_refs(folio); return; } @@ -474,7 +497,7 @@ void folio_add_lru(struct folio *folio) folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - /* see the comment in lru_gen_add_folio() */ + /* see the comment in lru_gen_folio_seq() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & 
PF_MEMALLOC)) folio_set_active(folio); @@ -524,7 +547,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) */ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { - bool active = folio_test_active(folio); + bool active = folio_test_active(folio) || lru_gen_enabled(); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) @@ -589,7 +612,10 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) lruvec_del_folio(lruvec, folio); folio_clear_active(folio); - folio_clear_referenced(folio); + if (lru_gen_enabled()) + lru_gen_clear_refs(folio); + else + folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal @@ -657,6 +683,9 @@ void deactivate_file_folio(struct folio *folio) if (folio_test_unevictable(folio)) return; + if (lru_gen_enabled() && lru_gen_clear_refs(folio)) + return; + folio_batch_add_and_move(folio, lru_deactivate_file, true); } @@ -670,7 +699,10 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) + if (folio_test_unevictable(folio)) + return; + + if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; folio_batch_add_and_move(folio, lru_deactivate, true); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index da1278f0563b..1007c30f12e2 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -6,149 +6,107 @@ #include <linux/swapops.h> /* depends on mm.h include */ static DEFINE_MUTEX(swap_cgroup_mutex); -struct swap_cgroup_ctrl { - struct page **map; - unsigned long length; - spinlock_t lock; -}; - -static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; +/* Pack two cgroup id (short) of two entries in one swap_cgroup (atomic_t) */ +#define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short)) +#define ID_SHIFT (BITS_PER_TYPE(unsigned short)) +#define ID_MASK (BIT(ID_SHIFT) - 1) struct swap_cgroup { - unsigned short id; + atomic_t ids; }; -#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) - -/* - * SwapCgroup implements "lookup" and "exchange" operations. - * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge - * against SwapCache. At swap_free(), this is accessed directly from swap. - * - * This means, - * - we have no race in "exchange" when we're accessed via SwapCache because - * SwapCache(and its swp_entry) is under lock. - * - When called via swap_free(), there is no user of this entry and no race. - * Then, we don't need lock around "exchange". - * - * TODO: we can push these buffers out to HIGHMEM. - */ - -/* - * allocate buffer for swap_cgroup. 
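lru_gen_inc_refs() in the swap.c hunk above keeps a small reference counter inside the folio flags word and bumps it with a cmpxchg loop that saturates instead of overflowing into neighbouring flag bits. A userspace sketch of that saturating, in-flags counter; the bit layout is made up:

#include <stdatomic.h>
#include <stdio.h>

/* Sketch of a saturating counter packed into a few bits of an atomic
 * flags word, updated with a CAS loop that never touches other bits. */
#define REFS_SHIFT 8u
#define REFS_MASK  (0xfu << REFS_SHIFT)

static void inc_refs(_Atomic unsigned int *flags)
{
    unsigned int old = atomic_load(flags), val;

    do {
        if ((old & REFS_MASK) == REFS_MASK)
            return;                     /* saturated: leave the word alone */
        val = old + (1u << REFS_SHIFT);
    } while (!atomic_compare_exchange_weak(flags, &old, val));
}

int main(void)
{
    _Atomic unsigned int flags = 0;

    for (int i = 0; i < 20; i++)
        inc_refs(&flags);
    printf("refs saturated at %u\n",
           (atomic_load(&flags) & REFS_MASK) >> REFS_SHIFT);
    return 0;
}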
- */ -static int swap_cgroup_prepare(int type) -{ - struct page *page; - struct swap_cgroup_ctrl *ctrl; - unsigned long idx, max; - - ctrl = &swap_cgroup_ctrl[type]; - - for (idx = 0; idx < ctrl->length; idx++) { - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - goto not_enough_page; - ctrl->map[idx] = page; - if (!(idx % SWAP_CLUSTER_MAX)) - cond_resched(); - } - return 0; -not_enough_page: - max = idx; - for (idx = 0; idx < max; idx++) - __free_page(ctrl->map[idx]); +struct swap_cgroup_ctrl { + struct swap_cgroup *map; +}; - return -ENOMEM; -} +static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; -static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl, - pgoff_t offset) +static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map, + pgoff_t offset) { - struct page *mappage; - struct swap_cgroup *sc; + unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; + unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids); + + BUILD_BUG_ON(!is_power_of_2(ID_PER_SC)); + BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t)); - mappage = ctrl->map[offset / SC_PER_PAGE]; - sc = page_address(mappage); - return sc + offset % SC_PER_PAGE; + return (old_ids >> shift) & ID_MASK; } -static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, - struct swap_cgroup_ctrl **ctrlp) +static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map, + pgoff_t offset, + unsigned short new_id) { - pgoff_t offset = swp_offset(ent); - struct swap_cgroup_ctrl *ctrl; - - ctrl = &swap_cgroup_ctrl[swp_type(ent)]; - if (ctrlp) - *ctrlp = ctrl; - return __lookup_swap_cgroup(ctrl, offset); + unsigned short old_id; + struct swap_cgroup *sc = &map[offset / ID_PER_SC]; + unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT; + unsigned int new_ids, old_ids = atomic_read(&sc->ids); + + do { + old_id = (old_ids >> shift) & ID_MASK; + new_ids = (old_ids & ~(ID_MASK << shift)); + new_ids |= ((unsigned int)new_id) << shift; + } while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids)); + + return old_id; } /** - * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. - * @ent: swap entry to be cmpxchged - * @old: old id - * @new: new id + * swap_cgroup_record - record mem_cgroup for a set of swap entries. + * These entries must belong to one single folio, and that folio + * must be being charged for swap space (swap out), and these + * entries must not have been charged * - * Returns old id at success, 0 at failure. 
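The new swap_cgroup map above packs two 16-bit cgroup IDs into each 32-bit atomic word and updates one lane with a compare-and-swap loop, so no spinlock is needed and the neighbouring entry is never disturbed. A self-contained sketch of that packing using C11 atomics; the lane and word names are illustrative:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Two 16-bit IDs share one 32-bit atomic word; exchange one lane via CAS. */
#define ID_SHIFT 16u
#define ID_MASK  0xffffu

static uint16_t lane_xchg(_Atomic uint32_t *word, unsigned int lane,
                          uint16_t new_id)
{
    unsigned int shift = (lane & 1) * ID_SHIFT;
    uint32_t old = atomic_load(word), val;

    do {
        val = (old & ~(ID_MASK << shift)) | ((uint32_t)new_id << shift);
    } while (!atomic_compare_exchange_weak(word, &old, val));

    return (old >> shift) & ID_MASK;
}

int main(void)
{
    _Atomic uint32_t word = 0;

    lane_xchg(&word, 0, 7);             /* entry 0 -> cgroup 7 */
    lane_xchg(&word, 1, 9);             /* entry 1 -> cgroup 9 */
    printf("old id of entry 0: %u\n", (unsigned int)lane_xchg(&word, 0, 0));
    return 0;
}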
- * (There is no mem_cgroup using 0 as its id) + * @folio: the folio that the swap entry belongs to + * @id: mem_cgroup ID to be recorded + * @ent: the first swap entry to be recorded */ -unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new) +void swap_cgroup_record(struct folio *folio, unsigned short id, + swp_entry_t ent) { - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned long flags; - unsigned short retval; - - sc = lookup_swap_cgroup(ent, &ctrl); - - spin_lock_irqsave(&ctrl->lock, flags); - retval = sc->id; - if (retval == old) - sc->id = new; - else - retval = 0; - spin_unlock_irqrestore(&ctrl->lock, flags); - return retval; + unsigned int nr_ents = folio_nr_pages(folio); + struct swap_cgroup *map; + pgoff_t offset, end; + unsigned short old; + + offset = swp_offset(ent); + end = offset + nr_ents; + map = swap_cgroup_ctrl[swp_type(ent)].map; + + do { + old = __swap_cgroup_id_xchg(map, offset, id); + VM_BUG_ON(old); + } while (++offset != end); } /** - * swap_cgroup_record - record mem_cgroup for a set of swap entries + * swap_cgroup_clear - clear mem_cgroup for a set of swap entries. + * These entries must be being uncharged from swap. They either + * belongs to one single folio in the swap cache (swap in for + * cgroup v1), or no longer have any users (slot freeing). + * * @ent: the first swap entry to be recorded into - * @id: mem_cgroup to be recorded * @nr_ents: number of swap entries to be recorded * - * Returns old value at success, 0 at failure. - * (Of course, old value can be 0.) + * Returns the existing old value. */ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents) +unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) { - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned short old; - unsigned long flags; pgoff_t offset = swp_offset(ent); pgoff_t end = offset + nr_ents; + struct swap_cgroup *map; + unsigned short old, iter = 0; - sc = lookup_swap_cgroup(ent, &ctrl); - - spin_lock_irqsave(&ctrl->lock, flags); - old = sc->id; - for (;;) { - VM_BUG_ON(sc->id != old); - sc->id = id; - offset++; - if (offset == end) - break; - if (offset % SC_PER_PAGE) - sc++; - else - sc = __lookup_swap_cgroup(ctrl, offset); - } - spin_unlock_irqrestore(&ctrl->lock, flags); + offset = swp_offset(ent); + end = offset + nr_ents; + map = swap_cgroup_ctrl[swp_type(ent)].map; + + do { + old = __swap_cgroup_id_xchg(map, offset, 0); + if (!iter) + iter = old; + VM_BUG_ON(iter != old); + } while (++offset != end); return old; } @@ -161,39 +119,33 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { + struct swap_cgroup_ctrl *ctrl; + if (mem_cgroup_disabled()) return 0; - return lookup_swap_cgroup(ent, NULL)->id; + + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent)); } int swap_cgroup_swapon(int type, unsigned long max_pages) { - void *array; - unsigned long length; + struct swap_cgroup *map; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) return 0; - length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); - - array = vcalloc(length, sizeof(void *)); - if (!array) + BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC != + sizeof(struct swap_cgroup)); + map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) * + sizeof(struct swap_cgroup)); + if (!map) goto nomem; ctrl = &swap_cgroup_ctrl[type]; mutex_lock(&swap_cgroup_mutex); - ctrl->length = 
length; - ctrl->map = array; - spin_lock_init(&ctrl->lock); - if (swap_cgroup_prepare(type)) { - /* memory shortage */ - ctrl->map = NULL; - ctrl->length = 0; - mutex_unlock(&swap_cgroup_mutex); - vfree(array); - goto nomem; - } + ctrl->map = map; mutex_unlock(&swap_cgroup_mutex); return 0; @@ -205,8 +157,7 @@ nomem: void swap_cgroup_swapoff(int type) { - struct page **map; - unsigned long i, length; + struct swap_cgroup *map; struct swap_cgroup_ctrl *ctrl; if (mem_cgroup_disabled()) @@ -215,19 +166,8 @@ void swap_cgroup_swapoff(int type) mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; - length = ctrl->length; ctrl->map = NULL; - ctrl->length = 0; mutex_unlock(&swap_cgroup_mutex); - if (map) { - for (i = 0; i < length; i++) { - struct page *page = map[i]; - if (page) - __free_page(page); - if (!(i % SWAP_CLUSTER_MAX)) - cond_resched(); - } - vfree(map); - } + vfree(map); } diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 13ab3b771409..9c7c171df7ba 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -43,17 +43,15 @@ static DEFINE_MUTEX(swap_slots_cache_mutex); /* Serialize swap slots cache enable/disable operations */ static DEFINE_MUTEX(swap_slots_cache_enable_mutex); -static void __drain_swap_slots_cache(unsigned int type); +static void __drain_swap_slots_cache(void); #define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled) -#define SLOTS_CACHE 0x1 -#define SLOTS_CACHE_RET 0x2 static void deactivate_swap_slots_cache(void) { mutex_lock(&swap_slots_cache_mutex); swap_slot_cache_active = false; - __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + __drain_swap_slots_cache(); mutex_unlock(&swap_slots_cache_mutex); } @@ -72,7 +70,7 @@ void disable_swap_slots_cache_lock(void) if (swap_slot_cache_initialized) { /* serialize with cpu hotplug operations */ cpus_read_lock(); - __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + __drain_swap_slots_cache(); cpus_read_unlock(); } } @@ -113,7 +111,7 @@ out: static int alloc_swap_slot_cache(unsigned int cpu) { struct swap_slots_cache *cache; - swp_entry_t *slots, *slots_ret; + swp_entry_t *slots; /* * Do allocation outside swap_slots_cache_mutex @@ -125,28 +123,19 @@ static int alloc_swap_slot_cache(unsigned int cpu) if (!slots) return -ENOMEM; - slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), - GFP_KERNEL); - if (!slots_ret) { - kvfree(slots); - return -ENOMEM; - } - mutex_lock(&swap_slots_cache_mutex); cache = &per_cpu(swp_slots, cpu); - if (cache->slots || cache->slots_ret) { + if (cache->slots) { /* cache already allocated */ mutex_unlock(&swap_slots_cache_mutex); kvfree(slots); - kvfree(slots_ret); return 0; } if (!cache->lock_initialized) { mutex_init(&cache->alloc_lock); - spin_lock_init(&cache->free_lock); cache->lock_initialized = true; } cache->nr = 0; @@ -160,19 +149,16 @@ static int alloc_swap_slot_cache(unsigned int cpu) */ mb(); cache->slots = slots; - cache->slots_ret = slots_ret; mutex_unlock(&swap_slots_cache_mutex); return 0; } -static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, - bool free_slots) +static void drain_slots_cache_cpu(unsigned int cpu, bool free_slots) { struct swap_slots_cache *cache; - swp_entry_t *slots = NULL; cache = &per_cpu(swp_slots, cpu); - if ((type & SLOTS_CACHE) && cache->slots) { + if (cache->slots) { mutex_lock(&cache->alloc_lock); swapcache_free_entries(cache->slots + cache->cur, cache->nr); cache->cur = 0; @@ -183,20 +169,9 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, } 
mutex_unlock(&cache->alloc_lock); } - if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { - spin_lock_irq(&cache->free_lock); - swapcache_free_entries(cache->slots_ret, cache->n_ret); - cache->n_ret = 0; - if (free_slots && cache->slots_ret) { - slots = cache->slots_ret; - cache->slots_ret = NULL; - } - spin_unlock_irq(&cache->free_lock); - kvfree(slots); - } } -static void __drain_swap_slots_cache(unsigned int type) +static void __drain_swap_slots_cache(void) { unsigned int cpu; @@ -224,13 +199,13 @@ static void __drain_swap_slots_cache(unsigned int type) * There are no slots on such cpu that need to be drained. */ for_each_online_cpu(cpu) - drain_slots_cache_cpu(cpu, type, false); + drain_slots_cache_cpu(cpu, false); } static int free_slot_cache(unsigned int cpu) { mutex_lock(&swap_slots_cache_mutex); - drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); + drain_slots_cache_cpu(cpu, true); mutex_unlock(&swap_slots_cache_mutex); return 0; } @@ -269,39 +244,6 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) return cache->nr; } -void free_swap_slot(swp_entry_t entry) -{ - struct swap_slots_cache *cache; - - /* Large folio swap slot is not covered. */ - zswap_invalidate(entry); - - cache = raw_cpu_ptr(&swp_slots); - if (likely(use_swap_slot_cache && cache->slots_ret)) { - spin_lock_irq(&cache->free_lock); - /* Swap slots cache may be deactivated before acquiring lock */ - if (!use_swap_slot_cache || !cache->slots_ret) { - spin_unlock_irq(&cache->free_lock); - goto direct_free; - } - if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { - /* - * Return slots to global pool. - * The current swap_map value is SWAP_HAS_CACHE. - * Set it to 0 to indicate it is available for - * allocation in global pool - */ - swapcache_free_entries(cache->slots_ret, cache->n_ret); - cache->n_ret = 0; - } - cache->slots_ret[cache->n_ret++] = entry; - spin_unlock_irq(&cache->free_lock); - } else { -direct_free: - swapcache_free_entries(&entry, 1); - } -} - swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; diff --git a/mm/swap_state.c b/mm/swap_state.c index e0c0321b8ff7..ca42b2be64d9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) struct folio_batch folios; unsigned int refs[PAGEVEC_SIZE]; - lru_add_drain(); folio_batch_init(&folios); for (int i = 0; i < nr; i++) { struct folio *folio = page_folio(encoded_page_ptr(pages[i])); diff --git a/mm/swapfile.c b/mm/swapfile.c index b0a9071cfe1d..df7c4e8b089c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,15 +53,15 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, - unsigned int nr_pages); -static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, +static void swap_entry_range_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages); +static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); -static struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, unsigned long offset); -static void unlock_cluster_or_swap_info(struct swap_info_struct *si, - struct swap_cluster_info *ci); +static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + 
unsigned long offset); +static inline void unlock_cluster(struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -129,6 +129,26 @@ static inline unsigned char swap_count(unsigned char ent) return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } +/* + * Use the second highest bit of inuse_pages counter as the indicator + * if one swap device is on the available plist, so the atomic can + * still be updated arithmetically while having special data embedded. + * + * inuse_pages counter is the only thing indicating if a device should + * be on avail_lists or not (except swapon / swapoff). By embedding the + * off-list bit in the atomic counter, updates no longer need any lock + * to check the list status. + * + * This bit will be set if the device is not on the plist and not + * usable, will be cleared if the device is on the plist. + */ +#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2)) +#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT) +static long swap_usage_in_pages(struct swap_info_struct *si) +{ + return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK; +} + /* Reclaim the swap entry anyway if possible */ #define TTRS_ANYWAY 0x1 /* @@ -222,9 +242,9 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. */ - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); need_reclaim = swap_is_has_cache(si, offset, nr_pages); - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); if (!need_reclaim) goto out_unlock; @@ -242,12 +262,9 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, folio_ref_sub(folio, nr_pages); folio_set_dirty(folio); - spin_lock(&si->lock); - /* Only sinple page folio can be backed by zswap */ - if (nr_pages == 1) - zswap_invalidate(entry); - swap_entry_range_free(si, entry, nr_pages); - spin_unlock(&si->lock); + ci = lock_cluster(si, offset); + swap_entry_range_free(si, ci, entry, nr_pages); + unlock_cluster(ci); ret = nr_pages; out_unlock: folio_unlock(folio); @@ -382,9 +399,23 @@ static void discard_swap_cluster(struct swap_info_struct *si, #endif #define LATENCY_LIMIT 256 -static inline bool cluster_is_free(struct swap_cluster_info *info) +static inline bool cluster_is_empty(struct swap_cluster_info *info) { - return info->flags & CLUSTER_FLAG_FREE; + return info->count == 0; +} + +static inline bool cluster_is_discard(struct swap_cluster_info *info) +{ + return info->flags == CLUSTER_FLAG_DISCARD; +} + +static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) +{ + if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) + return false; + if (!order) + return true; + return cluster_is_empty(ci) || order == ci->order; } static inline unsigned int cluster_index(struct swap_info_struct *si, @@ -393,6 +424,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, return ci - si->cluster_info; } +static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, + unsigned long offset) +{ + return &si->cluster_info[offset / SWAPFILE_CLUSTER]; +} + static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { @@ -404,45 +441,38 @@ static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si { struct swap_cluster_info *ci; - ci = si->cluster_info; - if (ci) { - ci += offset / 
SWAPFILE_CLUSTER; - spin_lock(&ci->lock); - } + ci = offset_to_cluster(si, offset); + spin_lock(&ci->lock); + return ci; } static inline void unlock_cluster(struct swap_cluster_info *ci) { - if (ci) - spin_unlock(&ci->lock); + spin_unlock(&ci->lock); } -/* - * Determine the locking method in use for this device. Return - * swap_cluster_info if SSD-style cluster-based locking is in place. - */ -static inline struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, unsigned long offset) +static void move_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci, struct list_head *list, + enum swap_cluster_flags new_flags) { - struct swap_cluster_info *ci; + VM_WARN_ON(ci->flags == new_flags); - /* Try to use fine-grained SSD-style locking if available: */ - ci = lock_cluster(si, offset); - /* Otherwise, fall back to traditional, coarse locking: */ - if (!ci) - spin_lock(&si->lock); - - return ci; -} + BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX); + lockdep_assert_held(&ci->lock); -static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, - struct swap_cluster_info *ci) -{ - if (ci) - unlock_cluster(ci); + spin_lock(&si->lock); + if (ci->flags == CLUSTER_FLAG_NONE) + list_add_tail(&ci->list, list); else - spin_unlock(&si->lock); + list_move_tail(&ci->list, list); + spin_unlock(&si->lock); + + if (ci->flags == CLUSTER_FLAG_FRAG) + atomic_long_dec(&si->frag_cluster_nr[ci->order]); + else if (new_flags == CLUSTER_FLAG_FRAG) + atomic_long_inc(&si->frag_cluster_nr[ci->order]); + ci->flags = new_flags; } /* Add a cluster to discard list and schedule it to do discard */ @@ -458,51 +488,98 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, */ memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); - - VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); - list_move_tail(&ci->list, &si->discard_clusters); - ci->flags = 0; + VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); + move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD); schedule_work(&si->discard_work); } static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - lockdep_assert_held(&si->lock); lockdep_assert_held(&ci->lock); - - if (ci->flags) - list_move_tail(&ci->list, &si->free_clusters); - else - list_add_tail(&ci->list, &si->free_clusters); - ci->flags = CLUSTER_FLAG_FREE; + move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; } /* + * Isolate and lock the first cluster that is not contented on a list, + * clean its flag before taken off-list. Cluster flag must be in sync + * with list status, so cluster updaters can always know the cluster + * list status without touching si lock. + * + * Note it's possible that all clusters on a list are contented so + * this returns NULL for an non-empty list. + */ +static struct swap_cluster_info *isolate_lock_cluster( + struct swap_info_struct *si, struct list_head *list) +{ + struct swap_cluster_info *ci, *ret = NULL; + + spin_lock(&si->lock); + + if (unlikely(!(si->flags & SWP_WRITEOK))) + goto out; + + list_for_each_entry(ci, list, list) { + if (!spin_trylock(&ci->lock)) + continue; + + /* We may only isolate and clear flags of following lists */ + VM_BUG_ON(!ci->flags); + VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE && + ci->flags != CLUSTER_FLAG_FULL); + + list_del(&ci->list); + ci->flags = CLUSTER_FLAG_NONE; + ret = ci; + break; + } +out: + spin_unlock(&si->lock); + + return ret; +} + +/* * Doing discard actually. 
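isolate_lock_cluster() above scans under the list lock, takes each cluster's own lock with trylock, and simply skips contended clusters, returning the first one it could both lock and take off-list. A pthread-based sketch of the same idea; the fixed-size array stands in for the cluster list:

#include <pthread.h>
#include <stdio.h>

struct item {
    pthread_mutex_t lock;
    int on_list;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct item *isolate_first_uncontended(struct item *items, int n)
{
    struct item *ret = NULL;

    pthread_mutex_lock(&list_lock);
    for (int i = 0; i < n; i++) {
        if (!items[i].on_list)
            continue;
        if (pthread_mutex_trylock(&items[i].lock))
            continue;                   /* contended: leave it for later */
        items[i].on_list = 0;           /* taken off-list with both locks held */
        ret = &items[i];                /* returned with its own lock held */
        break;
    }
    pthread_mutex_unlock(&list_lock);
    return ret;
}

int main(void)
{
    struct item items[2] = {
        { PTHREAD_MUTEX_INITIALIZER, 1 },
        { PTHREAD_MUTEX_INITIALIZER, 1 },
    };
    struct item *it = isolate_first_uncontended(items, 2);

    if (it) {
        printf("isolated item %td\n", it - items);
        pthread_mutex_unlock(&it->lock);
    }
    return 0;
}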
After a cluster discard is finished, the cluster - * will be added to free cluster list. caller should hold si->lock. -*/ -static void swap_do_scheduled_discard(struct swap_info_struct *si) + * will be added to free cluster list. Discard cluster is a bit special as + * they don't participate in allocation or reclaim, so clusters marked as + * CLUSTER_FLAG_DISCARD must remain off-list or on discard list. + */ +static bool swap_do_scheduled_discard(struct swap_info_struct *si) { struct swap_cluster_info *ci; + bool ret = false; unsigned int idx; + spin_lock(&si->lock); while (!list_empty(&si->discard_clusters)) { ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); + /* + * Delete the cluster from list to prepare for discard, but keep + * the CLUSTER_FLAG_DISCARD flag, there could be percpu_cluster + * pointing to it, or ran into by relocate_cluster. + */ list_del(&ci->list); idx = cluster_index(si, ci); spin_unlock(&si->lock); - discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER); - spin_lock(&si->lock); spin_lock(&ci->lock); - __free_cluster(si, ci); + /* + * Discard is done, clear its flags as it's off-list, then + * return the cluster to allocation list. + */ + ci->flags = CLUSTER_FLAG_NONE; memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); + __free_cluster(si, ci); spin_unlock(&ci->lock); + ret = true; + spin_lock(&si->lock); } + spin_unlock(&si->lock); + return ret; } static void swap_discard_work(struct work_struct *work) @@ -511,9 +588,7 @@ static void swap_discard_work(struct work_struct *work) si = container_of(work, struct swap_info_struct, discard_work); - spin_lock(&si->lock); swap_do_scheduled_discard(si); - spin_unlock(&si->lock); } static void swap_users_ref_free(struct percpu_ref *ref) @@ -524,15 +599,16 @@ static void swap_users_ref_free(struct percpu_ref *ref) complete(&si->comp); } +/* + * Must be called after freeing if ci->count == 0, moves the cluster to free + * or discard list. + */ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { VM_BUG_ON(ci->count != 0); - lockdep_assert_held(&si->lock); + VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); lockdep_assert_held(&ci->lock); - if (ci->flags & CLUSTER_FLAG_FRAG) - si->frag_cluster_nr[ci->order]--; - /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed @@ -548,6 +624,49 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info * } /* + * Must be called after freeing if ci->count != 0, moves the cluster to + * nonfull list. + */ +static void partial_free_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER); + lockdep_assert_held(&ci->lock); + + if (ci->flags != CLUSTER_FLAG_NONFULL) + move_cluster(si, ci, &si->nonfull_clusters[ci->order], + CLUSTER_FLAG_NONFULL); +} + +/* + * Must be called after allocation, moves the cluster to full or frag list. + * Note: allocation doesn't acquire si lock, and may drop the ci lock for + * reclaim, so the cluster could be any where when called. 
+ */ +static void relocate_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + lockdep_assert_held(&ci->lock); + + /* Discard cluster must remain off-list or on discard list */ + if (cluster_is_discard(ci)) + return; + + if (!ci->count) { + if (ci->flags != CLUSTER_FLAG_FREE) + free_cluster(si, ci); + } else if (ci->count != SWAPFILE_CLUSTER) { + if (ci->flags != CLUSTER_FLAG_FRAG) + move_cluster(si, ci, &si->frag_clusters[ci->order], + CLUSTER_FLAG_FRAG); + } else { + if (ci->flags != CLUSTER_FLAG_FULL) + move_cluster(si, ci, &si->full_clusters, + CLUSTER_FLAG_FULL); + } +} + +/* * The cluster corresponding to page_nr will be used. The cluster will not be * added to free cluster list and its usage counter will be increased by 1. * Only used for initialization. @@ -558,9 +677,6 @@ static void inc_cluster_info_page(struct swap_info_struct *si, unsigned long idx = page_nr / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; - if (!cluster_info) - return; - ci = cluster_info + idx; ci->count++; @@ -568,63 +684,33 @@ static void inc_cluster_info_page(struct swap_info_struct *si, VM_BUG_ON(ci->flags); } -/* - * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, - * which means no page in the cluster is in use, we can optionally discard - * the cluster and add it to free cluster list. - */ -static void dec_cluster_info_page(struct swap_info_struct *si, - struct swap_cluster_info *ci, int nr_pages) -{ - if (!si->cluster_info) - return; - - VM_BUG_ON(ci->count < nr_pages); - VM_BUG_ON(cluster_is_free(ci)); - lockdep_assert_held(&si->lock); - lockdep_assert_held(&ci->lock); - ci->count -= nr_pages; - - if (!ci->count) { - free_cluster(si, ci); - return; - } - - if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { - VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); - if (ci->flags & CLUSTER_FLAG_FRAG) - si->frag_cluster_nr[ci->order]--; - list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]); - ci->flags = CLUSTER_FLAG_NONFULL; - } -} - static bool cluster_reclaim_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long start, unsigned long end) { unsigned char *map = si->swap_map; - unsigned long offset; + unsigned long offset = start; + int nr_reclaim; spin_unlock(&ci->lock); - spin_unlock(&si->lock); - - for (offset = start; offset < end; offset++) { + do { switch (READ_ONCE(map[offset])) { case 0: - continue; + offset++; + break; case SWAP_HAS_CACHE: - if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) - continue; - goto out; + nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + if (nr_reclaim > 0) + offset += nr_reclaim; + else + goto out; + break; default: goto out; } - } + } while (offset < end); out: - spin_lock(&si->lock); spin_lock(&ci->lock); - /* * Recheck the range no matter reclaim succeeded or not, the slot * could have been be freed while we are not holding the lock. 
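relocate_cluster() above re-files a cluster purely by its usage count once an allocation attempt finishes: empty clusters go back to the free (or discard) path, partially used ones to the frag list, and full ones to the full list. A tiny sketch of that classification, with SWAPFILE_CLUSTER assumed to be the number of slots per cluster:

#include <stdio.h>

#define SWAPFILE_CLUSTER 512u

static const char *destination(unsigned int count)
{
    if (count == 0)
        return "free (or discard) list";
    if (count < SWAPFILE_CLUSTER)
        return "frag list";
    return "full list";
}

int main(void)
{
    printf("  0 -> %s\n511 -> %s\n512 -> %s\n",
           destination(0), destination(511), destination(512));
    return 0;
}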
@@ -638,11 +724,11 @@ out: static bool cluster_scan_range(struct swap_info_struct *si, struct swap_cluster_info *ci, - unsigned long start, unsigned int nr_pages) + unsigned long start, unsigned int nr_pages, + bool *need_reclaim) { unsigned long offset, end = start + nr_pages; unsigned char *map = si->swap_map; - bool need_reclaim = false; for (offset = start; offset < end; offset++) { switch (READ_ONCE(map[offset])) { @@ -651,16 +737,13 @@ static bool cluster_scan_range(struct swap_info_struct *si, case SWAP_HAS_CACHE: if (!vm_swap_full()) return false; - need_reclaim = true; + *need_reclaim = true; continue; default: return false; } } - if (need_reclaim) - return cluster_reclaim_range(si, ci, start, end); - return true; } @@ -670,73 +753,79 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster { unsigned int nr_pages = 1 << order; + lockdep_assert_held(&ci->lock); + if (!(si->flags & SWP_WRITEOK)) return false; - if (cluster_is_free(ci)) { - if (nr_pages < SWAPFILE_CLUSTER) { - list_move_tail(&ci->list, &si->nonfull_clusters[order]); - ci->flags = CLUSTER_FLAG_NONFULL; - } + /* + * The first allocation in a cluster makes the + * cluster exclusive to this order + */ + if (cluster_is_empty(ci)) ci->order = order; - } memset(si->swap_map + start, usage, nr_pages); - swap_range_alloc(si, start, nr_pages); + swap_range_alloc(si, nr_pages); ci->count += nr_pages; - if (ci->count == SWAPFILE_CLUSTER) { - VM_BUG_ON(!(ci->flags & - (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); - if (ci->flags & CLUSTER_FLAG_FRAG) - si->frag_cluster_nr[ci->order]--; - list_move_tail(&ci->list, &si->full_clusters); - ci->flags = CLUSTER_FLAG_FULL; - } - return true; } -static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, - unsigned int *foundp, unsigned int order, +/* Try use a new cluster for current CPU and allocate from it. */ +static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long offset, + unsigned int order, unsigned char usage) { - unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); + unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; + unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); unsigned int nr_pages = 1 << order; - struct swap_cluster_info *ci; + bool need_reclaim, ret; - if (end < nr_pages) - return SWAP_NEXT_INVALID; - end -= nr_pages; + lockdep_assert_held(&ci->lock); - ci = lock_cluster(si, offset); - if (ci->count + nr_pages > SWAPFILE_CLUSTER) { - offset = SWAP_NEXT_INVALID; - goto done; - } + if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER) + goto out; - while (offset <= end) { - if (cluster_scan_range(si, ci, offset, nr_pages)) { - if (!cluster_alloc_range(si, ci, offset, usage, order)) { - offset = SWAP_NEXT_INVALID; - goto done; - } - *foundp = offset; - if (ci->count == SWAPFILE_CLUSTER) { - offset = SWAP_NEXT_INVALID; - goto done; - } - offset += nr_pages; - break; + for (end -= nr_pages; offset <= end; offset += nr_pages) { + need_reclaim = false; + if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim)) + continue; + if (need_reclaim) { + ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages); + /* + * Reclaim drops ci->lock and cluster could be used + * by another order. Not checking flag as off-list + * cluster has no flag set, and change of list + * won't cause fragmentation. 
+ */ + if (!cluster_is_usable(ci, order)) + goto out; + if (cluster_is_empty(ci)) + offset = start; + /* Reclaim failed but cluster is usable, try next */ + if (!ret) + continue; } + if (!cluster_alloc_range(si, ci, offset, usage, order)) + break; + found = offset; offset += nr_pages; + if (ci->count < SWAPFILE_CLUSTER && offset <= end) + next = offset; + break; } - if (offset > end) - offset = SWAP_NEXT_INVALID; -done: +out: + relocate_cluster(si, ci); unlock_cluster(ci); - return offset; + if (si->flags & SWP_SOLIDSTATE) + __this_cpu_write(si->percpu_cluster->next[order], next); + else + si->global_cluster->next[order] = next; + return found; } /* Return true if reclaimed a whole cluster */ @@ -749,20 +838,19 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) int nr_reclaim; if (force) - to_scan = si->inuse_pages / SWAPFILE_CLUSTER; + to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; - while (!list_empty(&si->full_clusters)) { - ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); - list_move_tail(&ci->list, &si->full_clusters); + while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; - spin_unlock(&si->lock); while (offset < end) { if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { + spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + spin_lock(&ci->lock); if (nr_reclaim) { offset += abs(nr_reclaim); continue; @@ -770,8 +858,12 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) } offset++; } - spin_lock(&si->lock); + /* in case no swap cache is reclaimed */ + if (ci->flags == CLUSTER_FLAG_NONE) + relocate_cluster(si, ci); + + unlock_cluster(ci); if (to_scan <= 0) break; } @@ -783,9 +875,7 @@ static void swap_reclaim_work(struct work_struct *work) si = container_of(work, struct swap_info_struct, reclaim_work); - spin_lock(&si->lock); swap_reclaim_full_clusters(si, true); - spin_unlock(&si->lock); } /* @@ -796,29 +886,41 @@ static void swap_reclaim_work(struct work_struct *work) static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, unsigned char usage) { - struct percpu_cluster *cluster; struct swap_cluster_info *ci; unsigned int offset, found = 0; -new_cluster: - lockdep_assert_held(&si->lock); - cluster = this_cpu_ptr(si->percpu_cluster); - offset = cluster->next[order]; + if (si->flags & SWP_SOLIDSTATE) { + /* Fast path using per CPU cluster */ + local_lock(&si->percpu_cluster->lock); + offset = __this_cpu_read(si->percpu_cluster->next[order]); + } else { + /* Serialize HDD SWAP allocation for each device. */ + spin_lock(&si->global_cluster_lock); + offset = si->global_cluster->next[order]; + } + if (offset) { - offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); + ci = lock_cluster(si, offset); + /* Cluster could have been used by another order */ + if (cluster_is_usable(ci, order)) { + if (cluster_is_empty(ci)) + offset = cluster_offset(si, ci); + found = alloc_swap_scan_cluster(si, ci, offset, + order, usage); + } else { + unlock_cluster(ci); + } if (found) goto done; } - if (!list_empty(&si->free_clusters)) { - ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); - /* - * Either we didn't touch the cluster due to swapoff, - * or the allocation must success. 
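The scan loop above walks one cluster in order-sized, order-aligned steps, reclaiming cached-only slots where needed and taking the first fully free run. A simplified userspace sketch of the scan itself (reclaim left out); CLUSTER_SLOTS and the map contents are made up:

#include <stdio.h>

#define CLUSTER_SLOTS 16u

static int scan_cluster(const unsigned char *map, unsigned int nr)
{
    for (unsigned int off = 0; off + nr <= CLUSTER_SLOTS; off += nr) {
        unsigned int i;

        for (i = 0; i < nr && !map[off + i]; i++)
            ;
        if (i == nr)
            return (int)off;            /* found an order-sized free run */
    }
    return -1;
}

int main(void)
{
    unsigned char map[CLUSTER_SLOTS] = { 1, 1, 1, 0, 0, 1 };   /* rest free */

    printf("order-2 run found at slot %d\n", scan_cluster(map, 4));
    return 0;
}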
- */ - VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); - goto done; +new_cluster: + ci = isolate_lock_cluster(si, &si->free_clusters); + if (ci) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); + if (found) + goto done; } /* Try reclaim from full clusters if free clusters list is drained */ @@ -826,56 +928,42 @@ new_cluster: swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { - unsigned int frags = 0; - - while (!list_empty(&si->nonfull_clusters[order])) { - ci = list_first_entry(&si->nonfull_clusters[order], - struct swap_cluster_info, list); - list_move_tail(&ci->list, &si->frag_clusters[order]); - ci->flags = CLUSTER_FLAG_FRAG; - si->frag_cluster_nr[order]++; - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, order, usage); - frags++; + unsigned int frags = 0, frags_existing; + + while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); if (found) - break; + goto done; + /* Clusters failed to allocate are moved to frag_clusters */ + frags++; } - if (!found) { + frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); + while (frags < frags_existing && + (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) { + atomic_long_dec(&si->frag_cluster_nr[order]); /* - * Nonfull clusters are moved to frag tail if we reached - * here, count them too, don't over scan the frag list. + * Rotate the frag list to iterate, they were all + * failing high order allocation or moved here due to + * per-CPU usage, but they could contain newly released + * reclaimable (eg. lazy-freed swap cache) slots. */ - while (frags < si->frag_cluster_nr[order]) { - ci = list_first_entry(&si->frag_clusters[order], - struct swap_cluster_info, list); - /* - * Rotate the frag list to iterate, they were all failing - * high order allocation or moved here due to per-CPU usage, - * this help keeping usable cluster ahead. - */ - list_move_tail(&ci->list, &si->frag_clusters[order]); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, order, usage); - frags++; - if (found) - break; - } + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); + if (found) + goto done; + frags++; } } - if (found) - goto done; - - if (!list_empty(&si->discard_clusters)) { - /* - * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them, then - * reread cluster_next_cpu since we dropped si->lock - */ - swap_do_scheduled_discard(si); + /* + * We don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them, then + * reread cluster_next_cpu since we dropped si->lock + */ + if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) goto new_cluster; - } if (order) goto done; @@ -886,74 +974,151 @@ new_cluster: * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. 
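cluster_alloc_swap_entry() above starts from a per-CPU "next" hint under local_lock on SSDs and falls back to a single lock-protected global hint for rotational devices. A rough userspace analogue of that split, using a per-thread hint versus a global, mutex-protected one:

#include <pthread.h>
#include <stdio.h>

static _Thread_local unsigned int percpu_next;      /* per-thread hint */
static unsigned int global_next;                    /* shared hint */
static pthread_mutex_t global_lock = PTHREAD_MUTEX_INITIALIZER;

static unsigned int next_hint(int solidstate)
{
    unsigned int hint;

    if (solidstate)
        return percpu_next++;           /* no cross-CPU contention */

    pthread_mutex_lock(&global_lock);   /* serialize HDD allocation */
    hint = global_next++;
    pthread_mutex_unlock(&global_lock);
    return hint;
}

int main(void)
{
    unsigned int a = next_hint(1);
    unsigned int b = next_hint(0);

    printf("ssd hint %u, hdd hint %u\n", a, b);
    return 0;
}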
*/ - while (!list_empty(&si->frag_clusters[o])) { - ci = list_first_entry(&si->frag_clusters[o], - struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, 0, usage); + while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { + atomic_long_dec(&si->frag_cluster_nr[o]); + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + 0, usage); if (found) goto done; } - while (!list_empty(&si->nonfull_clusters[o])) { - ci = list_first_entry(&si->nonfull_clusters[o], - struct swap_cluster_info, list); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, 0, usage); + while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + 0, usage); if (found) goto done; } } - done: - cluster->next[order] = offset; + if (si->flags & SWP_SOLIDSTATE) + local_unlock(&si->percpu_cluster->lock); + else + spin_unlock(&si->global_cluster_lock); return found; } -static void __del_from_avail_list(struct swap_info_struct *si) +/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ +static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { int nid; + unsigned long pages; + + spin_lock(&swap_avail_lock); + + if (swapoff) { + /* + * Forcefully remove it. Clear the SWP_WRITEOK flags for + * swapoff here so it's synchronized by both si->lock and + * swap_avail_lock, to ensure the result can be seen by + * add_to_avail_list. + */ + lockdep_assert_held(&si->lock); + si->flags &= ~SWP_WRITEOK; + atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); + } else { + /* + * If not called by swapoff, take it off-list only if it's + * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly + * si->inuse_pages == pages), any concurrent slot freeing, + * or device already removed from plist by someone else + * will make this return false. + */ + pages = si->pages; + if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages, + pages | SWAP_USAGE_OFFLIST_BIT)) + goto skip; + } - assert_spin_locked(&si->lock); for_each_node(nid) plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); + +skip: + spin_unlock(&swap_avail_lock); } -static void del_from_avail_list(struct swap_info_struct *si) +/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */ +static void add_to_avail_list(struct swap_info_struct *si, bool swapon) { + int nid; + long val; + unsigned long pages; + spin_lock(&swap_avail_lock); - __del_from_avail_list(si); + + /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */ + if (swapon) { + lockdep_assert_held(&si->lock); + si->flags |= SWP_WRITEOK; + } else { + if (!(READ_ONCE(si->flags) & SWP_WRITEOK)) + goto skip; + } + + if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT)) + goto skip; + + val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); + + /* + * When device is full and device is on the plist, only one updater will + * see (inuse_pages == si->pages) and will call del_from_avail_list. If + * that updater happen to be here, just skip adding. 
+ */ + pages = si->pages; + if (val == pages) { + /* Just like the cmpxchg in del_from_avail_list */ + if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages, + pages | SWAP_USAGE_OFFLIST_BIT)) + goto skip; + } + + for_each_node(nid) + plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); + +skip: spin_unlock(&swap_avail_lock); } -static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, - unsigned int nr_entries) +/* + * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock + * within each cluster, so the total contribution to the global counter should + * always be positive and cannot exceed the total number of usable slots. + */ +static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries) { - unsigned int end = offset + nr_entries - 1; + long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages); - if (offset == si->lowest_bit) - si->lowest_bit += nr_entries; - if (end == si->highest_bit) - WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); - WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); - if (si->inuse_pages == si->pages) { - si->lowest_bit = si->max; - si->highest_bit = 0; - del_from_avail_list(si); - - if (si->cluster_info && vm_swap_full()) - schedule_work(&si->reclaim_work); + /* + * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set, + * remove it from the plist. + */ + if (unlikely(val == si->pages)) { + del_from_avail_list(si, false); + return true; } + + return false; } -static void add_to_avail_list(struct swap_info_struct *si) +static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries) { - int nid; + long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages); - spin_lock(&swap_avail_lock); - for_each_node(nid) - plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); - spin_unlock(&swap_avail_lock); + /* + * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set, + * remove it from the plist. + */ + if (unlikely(val & SWAP_USAGE_OFFLIST_BIT)) + add_to_avail_list(si, false); +} + +static void swap_range_alloc(struct swap_info_struct *si, + unsigned int nr_entries) +{ + if (swap_usage_add(si, nr_entries)) { + if (vm_swap_full()) + schedule_work(&si->reclaim_work); + } } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, @@ -968,18 +1133,11 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, * Use atomic clear_bit operations only on zeromap instead of non-atomic * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. 
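swap_usage_add() and swap_usage_sub() above let the usage counter and the off-list flag share one atomic word: readers mask the flag off, and the full/not-full transitions are detected from the returned value without any list lock. A simplified sketch of that counter; the real code sets the off-list bit with a cmpxchg only when the device is exactly full, which is elided here:

#include <stdatomic.h>
#include <stdio.h>

#define OFFLIST_BIT  (1ul << 30)
#define COUNTER_MASK (~OFFLIST_BIT)
#define TOTAL_PAGES  1024ul

static atomic_ulong inuse = 0;

static unsigned long usage_in_pages(void)
{
    return atomic_load(&inuse) & COUNTER_MASK;
}

static void usage_add(unsigned long nr)
{
    unsigned long val = atomic_fetch_add(&inuse, nr) + nr;

    if (val == TOTAL_PAGES)
        printf("device is full, take it off the available list\n");
}

static void usage_sub(unsigned long nr)
{
    unsigned long val = atomic_fetch_sub(&inuse, nr) - nr;

    if (val & OFFLIST_BIT)
        printf("space freed while off-list, put it back\n");
}

int main(void)
{
    usage_add(TOTAL_PAGES);                    /* fills the device */
    atomic_fetch_or(&inuse, OFFLIST_BIT);      /* as del_from_avail_list() would */
    usage_sub(8);
    printf("in use: %lu\n", usage_in_pages());
    return 0;
}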
*/ - for (i = 0; i < nr_entries; i++) + for (i = 0; i < nr_entries; i++) { clear_bit(offset + i, si->zeromap); - - if (offset < si->lowest_bit) - si->lowest_bit = offset; - if (end > si->highest_bit) { - bool was_full = !si->highest_bit; - - WRITE_ONCE(si->highest_bit, end); - if (was_full && (si->flags & SWP_WRITEOK)) - add_to_avail_list(si); + zswap_invalidate(swp_entry(si->type, offset + i)); } + if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; @@ -999,50 +1157,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, */ smp_wmb(); atomic_long_add(nr_entries, &nr_swap_pages); - WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); -} - -static void set_cluster_next(struct swap_info_struct *si, unsigned long next) -{ - unsigned long prev; - - if (!(si->flags & SWP_SOLIDSTATE)) { - si->cluster_next = next; - return; - } - - prev = this_cpu_read(*si->cluster_next_cpu); - /* - * Cross the swap address space size aligned trunk, choose - * another trunk randomly to avoid lock contention on swap - * address space if possible. - */ - if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != - (next >> SWAP_ADDRESS_SPACE_SHIFT)) { - /* No free swap slots available */ - if (si->highest_bit <= si->lowest_bit) - return; - next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); - next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); - next = max_t(unsigned int, next, si->lowest_bit); - } - this_cpu_write(*si->cluster_next_cpu, next); -} - -static bool swap_offset_available_and_locked(struct swap_info_struct *si, - unsigned long offset) -{ - if (data_race(!si->swap_map[offset])) { - spin_lock(&si->lock); - return true; - } - - if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { - spin_lock(&si->lock); - return true; - } - - return false; + swap_usage_sub(si, nr_entries); } static int cluster_alloc_swap(struct swap_info_struct *si, @@ -1051,10 +1166,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si, { int n_ret = 0; - VM_BUG_ON(!si->cluster_info); - - si->flags += SWP_SCANNING; - while (n_ret < nr) { unsigned long offset = cluster_alloc_swap_entry(si, order, usage); @@ -1063,8 +1174,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si, slots[n_ret++] = swp_entry(si->type, offset); } - si->flags -= SWP_SCANNING; - return n_ret; } @@ -1072,13 +1181,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { - unsigned long offset; - unsigned long scan_base; - unsigned long last_in_cluster = 0; - int latency_ration = LATENCY_LIMIT; unsigned int nr_pages = 1 << order; - int n_ret = 0; - bool scanned_many = false; /* * We try to cluster swap pages by allocating them sequentially @@ -1090,7 +1193,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, * But we do now try to find an empty cluster. -Andrea * And we let swap pages go all over an SSD partition. Hugh */ - if (order > 0) { /* * Should not even be attempting large allocations when huge @@ -1103,165 +1205,30 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } /* - * Swapfile is not block device or not using clusters so unable + * Swapfile is not block device so unable * to allocate large entries. 
*/ - if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) + if (!(si->flags & SWP_BLKDEV)) return 0; } - if (si->cluster_info) - return cluster_alloc_swap(si, usage, nr, slots, order); - - si->flags += SWP_SCANNING; - - /* For HDD, sequential access is more important. */ - scan_base = si->cluster_next; - offset = scan_base; - - if (unlikely(!si->cluster_nr--)) { - if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - - spin_unlock(&si->lock); - - /* - * If seek is expensive, start searching for new cluster from - * start of partition, to minimize the span of allocated swap. - */ - scan_base = offset = si->lowest_bit; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - - /* Locate the first empty (unaligned) cluster */ - for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) { - if (si->swap_map[offset]) - last_in_cluster = offset + SWAPFILE_CLUSTER; - else if (offset == last_in_cluster) { - spin_lock(&si->lock); - offset -= SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - } - } - - offset = scan_base; - spin_lock(&si->lock); - si->cluster_nr = SWAPFILE_CLUSTER - 1; - } - -checks: - if (!(si->flags & SWP_WRITEOK)) - goto no_page; - if (!si->highest_bit) - goto no_page; - if (offset > si->highest_bit) - scan_base = offset = si->lowest_bit; - - /* reuse swap entry of cache-only swap if not busy. */ - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - int swap_was_freed; - spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); - spin_lock(&si->lock); - /* entry was freed successfully, try to use this again */ - if (swap_was_freed > 0) - goto checks; - goto scan; /* check next one */ - } - - if (si->swap_map[offset]) { - if (!n_ret) - goto scan; - else - goto done; - } - memset(si->swap_map + offset, usage, nr_pages); - - swap_range_alloc(si, offset, nr_pages); - slots[n_ret++] = swp_entry(si->type, offset); - - /* got enough slots or reach max slots? */ - if ((n_ret == nr) || (offset >= si->highest_bit)) - goto done; - - /* search for next available slot */ - - /* time to take a break? */ - if (unlikely(--latency_ration < 0)) { - if (n_ret) - goto done; - spin_unlock(&si->lock); - cond_resched(); - spin_lock(&si->lock); - latency_ration = LATENCY_LIMIT; - } - - if (si->cluster_nr && !si->swap_map[++offset]) { - /* non-ssd case, still more slots in cluster? */ - --si->cluster_nr; - goto checks; - } + return cluster_alloc_swap(si, usage, nr, slots, order); +} +static bool get_swap_device_info(struct swap_info_struct *si) +{ + if (!percpu_ref_tryget_live(&si->users)) + return false; /* - * Even if there's no free clusters available (fragmented), - * try to scan a little more quickly with lock held unless we - * have scanned too many slots already. + * Guarantee the si->users are checked before accessing other + * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is + * up to dated. + * + * Paired with the spin_unlock() after setup_swap_info() in + * enable_swap_info(), and smp_wmb() in swapoff. 
*/ - if (!scanned_many) { - unsigned long scan_limit; - - if (offset < scan_base) - scan_limit = scan_base; - else - scan_limit = si->highest_bit; - for (; offset <= scan_limit && --latency_ration > 0; - offset++) { - if (!si->swap_map[offset]) - goto checks; - } - } - -done: - if (order == 0) - set_cluster_next(si, offset + 1); - si->flags -= SWP_SCANNING; - return n_ret; - -scan: - VM_WARN_ON(order > 0); - spin_unlock(&si->lock); - while (++offset <= READ_ONCE(si->highest_bit)) { - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - scanned_many = true; - } - if (swap_offset_available_and_locked(si, offset)) - goto checks; - } - offset = si->lowest_bit; - while (offset < scan_base) { - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - scanned_many = true; - } - if (swap_offset_available_and_locked(si, offset)) - goto checks; - offset++; - } - spin_lock(&si->lock); - -no_page: - si->flags -= SWP_SCANNING; - return n_ret; + smp_rmb(); + return true; } int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) @@ -1291,32 +1258,15 @@ start_over: /* requeue si to after same-priority siblings */ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); - spin_lock(&si->lock); - if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - if (plist_node_empty(&si->avail_lists[node])) { - spin_unlock(&si->lock); - goto nextsi; - } - WARN(!si->highest_bit, - "swap_info %d in list but !highest_bit\n", - si->type); - WARN(!(si->flags & SWP_WRITEOK), - "swap_info %d in list but !SWP_WRITEOK\n", - si->type); - __del_from_avail_list(si); - spin_unlock(&si->lock); - goto nextsi; + if (get_swap_device_info(si)) { + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries, order); + put_swap_device(si); + if (n_ret || size > 1) + goto check_out; } - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries, order); - spin_unlock(&si->lock); - if (n_ret || size > 1) - goto check_out; - cond_resched(); spin_lock(&swap_avail_lock); -nextsi: /* * if we got here, it's likely that si was almost full before, * and since scan_swap_map_slots() can drop the si->lock, @@ -1376,22 +1326,6 @@ out: return NULL; } -static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, - struct swap_info_struct *q) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - - if (p != q) { - if (q != NULL) - spin_unlock(&q->lock); - if (p != NULL) - spin_lock(&p->lock); - } - return p; -} - static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, unsigned long offset, unsigned char usage) @@ -1481,16 +1415,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) si = swp_swap_info(entry); if (!si) goto bad_nofile; - if (!percpu_ref_tryget_live(&si->users)) + if (!get_swap_device_info(si)) goto out; - /* - * Guarantee the si->users are checked before accessing other - * fields of swap_info_struct. - * - * Paired with the spin_unlock() after setup_swap_info() in - * enable_swap_info(). 
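
get_swap_device_info() above collapses the old open-coded sequence into one helper: a percpu_ref tryget followed by smp_rmb(), which the comment pairs with the unlock/smp_wmb() on the enable and swapoff sides. Stripped down to standard C11 acquire/release ordering (so this is a model of the pairing, not the kernel primitives, and dev_enable/dev_tryget are invented names), the idea looks like:

	#include <stdatomic.h>
	#include <stdbool.h>

	struct dev_info {
		int flags;		/* e.g. a WRITEOK-style bit */
		long max;
		atomic_bool users;	/* stands in for the percpu_ref */
	};

	/* Writer: fill in the fields, then publish with release ordering. */
	static void dev_enable(struct dev_info *d, int flags, long max)
	{
		d->flags = flags;
		d->max = max;
		atomic_store_explicit(&d->users, true, memory_order_release);
	}

	/* Reader: only touch the fields after the acquire-side check succeeds. */
	static bool dev_tryget(struct dev_info *d)
	{
		if (!atomic_load_explicit(&d->users, memory_order_acquire))
			return false;
		/* Any later read of d->flags or d->max sees dev_enable()'s stores. */
		return true;
	}
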
- */ - smp_rmb(); offset = swp_offset(entry); if (offset >= si->max) goto put_out; @@ -1513,11 +1439,11 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si, unsigned long offset = swp_offset(entry); unsigned char usage; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); usage = __swap_entry_free_locked(si, offset, 1); - unlock_cluster_or_swap_info(si, ci); if (!usage) - free_swap_slot(entry); + swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + unlock_cluster(ci); return usage; } @@ -1538,22 +1464,17 @@ static bool __swap_entries_free(struct swap_info_struct *si, if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) goto fallback; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); goto fallback; } for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - unlock_cluster_or_swap_info(si, ci); + if (!has_cache) + swap_entry_range_free(si, ci, entry, nr); + unlock_cluster(ci); - if (!has_cache) { - for (i = 0; i < nr; i++) - zswap_invalidate(swp_entry(si->type, offset + i)); - spin_lock(&si->lock); - swap_entry_range_free(si, entry, nr); - spin_unlock(&si->lock); - } return has_cache; fallback: @@ -1573,24 +1494,32 @@ fallback: * Drop the last HAS_CACHE flag of swap entries, caller have to * ensure all entries belong to the same cgroup. */ -static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, - unsigned int nr_pages) +static void swap_entry_range_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; - struct swap_cluster_info *ci; - ci = lock_cluster(si, offset); + /* It should never free entries across different clusters */ + VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(cluster_is_empty(ci)); + VM_BUG_ON(ci->count < nr_pages); + + ci->count -= nr_pages; do { VM_BUG_ON(*map != SWAP_HAS_CACHE); *map = 0; } while (++map < map_end); - dec_cluster_info_page(si, ci, nr_pages); - unlock_cluster(ci); mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); + + if (!ci->count) + free_cluster(si, ci); + else + partial_free_cluster(si, ci); } static void cluster_swap_free_nr(struct swap_info_struct *si, @@ -1598,29 +1527,14 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, unsigned char usage) { struct swap_cluster_info *ci; - DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; - int i, nr; + unsigned long end = offset + nr_pages; - ci = lock_cluster_or_swap_info(si, offset); - while (nr_pages) { - nr = min(BITS_PER_LONG, nr_pages); - for (i = 0; i < nr; i++) { - if (!__swap_entry_free_locked(si, offset + i, usage)) - bitmap_set(to_free, i, 1); - } - if (!bitmap_empty(to_free, BITS_PER_LONG)) { - unlock_cluster_or_swap_info(si, ci); - for_each_set_bit(i, to_free, BITS_PER_LONG) - free_swap_slot(swp_entry(si->type, offset + i)); - if (nr == nr_pages) - return; - bitmap_clear(to_free, 0, BITS_PER_LONG); - ci = lock_cluster_or_swap_info(si, offset); - } - offset += nr; - nr_pages -= nr; - } - unlock_cluster_or_swap_info(si, ci); + ci = lock_cluster(si, offset); + do { + if (!__swap_entry_free_locked(si, offset, usage)) + swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + } while (++offset < end); 
+ unlock_cluster(ci); } /* @@ -1659,59 +1573,35 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (!si) return; - ci = lock_cluster_or_swap_info(si, offset); - if (size > 1 && swap_is_has_cache(si, offset, size)) { - unlock_cluster_or_swap_info(si, ci); - spin_lock(&si->lock); - swap_entry_range_free(si, entry, size); - spin_unlock(&si->lock); - return; - } - for (int i = 0; i < size; i++, entry.val++) { - if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { - unlock_cluster_or_swap_info(si, ci); - free_swap_slot(entry); - if (i == size - 1) - return; - lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); + if (swap_is_has_cache(si, offset, size)) + swap_entry_range_free(si, ci, entry, size); + else { + for (int i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) + swap_entry_range_free(si, ci, entry, 1); } } - unlock_cluster_or_swap_info(si, ci); -} - -static int swp_entry_cmp(const void *ent1, const void *ent2) -{ - const swp_entry_t *e1 = ent1, *e2 = ent2; - - return (int)swp_type(*e1) - (int)swp_type(*e2); + unlock_cluster(ci); } void swapcache_free_entries(swp_entry_t *entries, int n) { - struct swap_info_struct *p, *prev; int i; + struct swap_cluster_info *ci; + struct swap_info_struct *si = NULL; if (n <= 0) return; - prev = NULL; - p = NULL; - - /* - * Sort swap entries by swap device, so each lock is only taken once. - * nr_swapfiles isn't absolutely correct, but the overhead of sort() is - * so low that it isn't necessary to optimize further. - */ - if (nr_swapfiles > 1) - sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { - p = swap_info_get_cont(entries[i], prev); - if (p) - swap_entry_range_free(p, entries[i], 1); - prev = p; + si = _swap_info_get(entries[i]); + if (si) { + ci = lock_cluster(si, swp_offset(entries[i])); + swap_entry_range_free(si, ci, entries[i], 1); + unlock_cluster(ci); + } } - if (p) - spin_unlock(&p->lock); } int __swap_count(swp_entry_t entry) @@ -1733,9 +1623,9 @@ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) struct swap_cluster_info *ci; int count; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return count; } @@ -1758,7 +1648,7 @@ int swp_swapcount(swp_entry_t entry) offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) @@ -1781,7 +1671,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return count; } @@ -1796,8 +1686,8 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; - ci = lock_cluster_or_swap_info(si, offset); - if (!ci || nr_pages == 1) { + ci = lock_cluster(si, offset); + if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; goto unlock_out; @@ -1809,7 +1699,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, } } unlock_out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return ret; } @@ -1963,10 +1853,11 @@ swp_entry_t get_swap_page_of_type(int type) goto fail; /* This is called for allocating swap entry, not cache */ - spin_lock(&si->lock); - if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) - 
atomic_long_dec(&nr_swap_pages); - spin_unlock(&si->lock); + if (get_swap_device_info(si)) { + if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) + atomic_long_dec(&nr_swap_pages); + put_swap_device(si); + } fail: return entry; } @@ -2057,7 +1948,7 @@ unsigned int count_swap_pages(int type, int free) if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) - n -= sis->inuse_pages; + n -= swap_usage_in_pages(sis); } spin_unlock(&sis->lock); } @@ -2392,7 +2283,7 @@ static int try_to_unuse(unsigned int type) swp_entry_t entry; unsigned int i; - if (!READ_ONCE(si->inuse_pages)) + if (!swap_usage_in_pages(si)) goto success; retry: @@ -2405,7 +2296,7 @@ retry: spin_lock(&mmlist_lock); p = &init_mm.mmlist; - while (READ_ONCE(si->inuse_pages) && + while (swap_usage_in_pages(si) && !signal_pending(current) && (p = p->next) != &init_mm.mmlist) { @@ -2433,7 +2324,7 @@ retry: mmput(prev_mm); i = 0; - while (READ_ONCE(si->inuse_pages) && + while (swap_usage_in_pages(si) && !signal_pending(current) && (i = find_next_to_unuse(si, i)) != 0) { @@ -2468,7 +2359,7 @@ retry: * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying. */ - if (READ_ONCE(si->inuse_pages)) { + if (swap_usage_in_pages(si)) { if (!signal_pending(current)) goto retry; return -EINTR; @@ -2495,7 +2386,7 @@ static void drain_mmlist(void) unsigned int type; for (type = 0; type < nr_swapfiles; type++) - if (swap_info[type]->inuse_pages) + if (swap_usage_in_pages(swap_info[type])) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) @@ -2674,7 +2565,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio, static void _enable_swap_info(struct swap_info_struct *si) { - si->flags |= SWP_WRITEOK; atomic_long_add(si->pages, &nr_swap_pages); total_swap_pages += si->pages; @@ -2691,9 +2581,8 @@ static void _enable_swap_info(struct swap_info_struct *si) */ plist_add(&si->list, &swap_active_head); - /* add to available list iff swap device is not full */ - if (si->highest_bit) - add_to_avail_list(si); + /* Add back to available list */ + add_to_avail_list(si, true); } static void enable_swap_info(struct swap_info_struct *si, int prio, @@ -2742,6 +2631,24 @@ bool has_usable_swap(void) return ret; } +/* + * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range + * see the updated flags, so there will be no more allocations. 
+ */ +static void wait_for_allocation(struct swap_info_struct *si) +{ + unsigned long offset; + unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER); + struct swap_cluster_info *ci; + + BUG_ON(si->flags & SWP_WRITEOK); + + for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { + ci = lock_cluster(si, offset); + unlock_cluster(ci); + } +} + SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; @@ -2791,7 +2698,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out_dput; } spin_lock(&p->lock); - del_from_avail_list(p); + del_from_avail_list(p, true); if (p->prio < 0) { struct swap_info_struct *si = p; int nid; @@ -2809,10 +2716,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; - p->flags &= ~SWP_WRITEOK; spin_unlock(&p->lock); spin_unlock(&swap_lock); + wait_for_allocation(p); + disable_swap_slots_cache_lock(); set_current_oom_origin(); @@ -2855,16 +2763,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_lock(&p->lock); drain_mmlist(); - /* wait for anyone still in scan_swap_map_slots */ - p->highest_bit = 0; /* cuts scans short */ - while (p->flags >= SWP_SCANNING) { - spin_unlock(&p->lock); - spin_unlock(&swap_lock); - schedule_timeout_uninterruptible(1); - spin_lock(&swap_lock); - spin_lock(&p->lock); - } - swap_file = p->swap_file; p->swap_file = NULL; p->max = 0; @@ -2881,8 +2779,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; - free_percpu(p->cluster_next_cpu); - p->cluster_next_cpu = NULL; + kfree(p->global_cluster); + p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); kvfree(cluster_info); @@ -2992,7 +2890,7 @@ static int swap_show(struct seq_file *swap, void *v) } bytes = K(si->pages); - inuse = K(READ_ONCE(si->inuse_pages)); + inuse = K(swap_usage_in_pages(si)); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); @@ -3109,6 +3007,7 @@ static struct swap_info_struct *alloc_swap_info(void) } spin_lock_init(&p->lock); spin_lock_init(&p->cont_lock); + atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); init_completion(&p->comp); return p; @@ -3193,10 +3092,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return 0; } - si->lowest_bit = 1; - si->cluster_next = 1; - si->cluster_nr = 0; - maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { @@ -3213,7 +3108,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } - si->highest_bit = maxpages - 1; if (!maxpages) return 0; @@ -3281,7 +3175,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; struct swap_cluster_info *cluster_info; unsigned long i, j, k, idx; int cpu, err = -ENOMEM; @@ -3293,25 +3186,27 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); - si->cluster_next_cpu = alloc_percpu(unsigned int); - if (!si->cluster_next_cpu) - goto err_free; - - /* Random start position to help with wear leveling */ - for_each_possible_cpu(cpu) - per_cpu(*si->cluster_next_cpu, cpu) 
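
wait_for_allocation() above is a lock hand-off: once SWP_WRITEOK has been cleared, locking and unlocking every cluster once guarantees that any allocator which entered its critical section before the flag changed has already left it, presumably because the allocator tests the flag under the cluster lock. A hedged user-space analogue of that drain step, with bucket mutexes standing in for ci->lock and all names (NR_BUCKETS, disable_and_wait, ...) invented here:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	#define NR_BUCKETS 64

	static pthread_mutex_t bucket_lock[NR_BUCKETS];
	static atomic_bool writeok = true;

	static void buckets_init(void)
	{
		for (int b = 0; b < NR_BUCKETS; b++)
			pthread_mutex_init(&bucket_lock[b], NULL);
	}

	/* Allocation side: the flag is only tested under the bucket lock. */
	static bool alloc_from_bucket(int b)
	{
		bool ok;

		pthread_mutex_lock(&bucket_lock[b]);
		ok = atomic_load(&writeok);
		if (ok) {
			/* ... hand out a slot from bucket b ... */
		}
		pthread_mutex_unlock(&bucket_lock[b]);
		return ok;
	}

	/* Teardown side: clear the flag, then cycle every lock once to drain. */
	static void disable_and_wait(void)
	{
		atomic_store(&writeok, false);
		for (int b = 0; b < NR_BUCKETS; b++) {
			pthread_mutex_lock(&bucket_lock[b]);
			pthread_mutex_unlock(&bucket_lock[b]);
		}
		/* From here on, no allocator can still be handing out slots. */
	}
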
= - get_random_u32_inclusive(1, si->highest_bit); + if (si->flags & SWP_SOLIDSTATE) { + si->percpu_cluster = alloc_percpu(struct percpu_cluster); + if (!si->percpu_cluster) + goto err_free; - si->percpu_cluster = alloc_percpu(struct percpu_cluster); - if (!si->percpu_cluster) - goto err_free; + for_each_possible_cpu(cpu) { + struct percpu_cluster *cluster; - for_each_possible_cpu(cpu) { - struct percpu_cluster *cluster; - - cluster = per_cpu_ptr(si->percpu_cluster, cpu); + cluster = per_cpu_ptr(si->percpu_cluster, cpu); + for (i = 0; i < SWAP_NR_ORDERS; i++) + cluster->next[i] = SWAP_ENTRY_INVALID; + local_lock_init(&cluster->lock); + } + } else { + si->global_cluster = kmalloc(sizeof(*si->global_cluster), + GFP_KERNEL); + if (!si->global_cluster) + goto err_free; for (i = 0; i < SWAP_NR_ORDERS; i++) - cluster->next[i] = SWAP_NEXT_INVALID; + si->global_cluster->next[i] = SWAP_ENTRY_INVALID; + spin_lock_init(&si->global_cluster_lock); } /* @@ -3335,7 +3230,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); - si->frag_cluster_nr[i] = 0; + atomic_long_set(&si->frag_cluster_nr[i], 0); } /* @@ -3343,7 +3238,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * sharing same address space. */ for (k = 0; k < SWAP_CLUSTER_COLS; k++) { - j = (k + col) % SWAP_CLUSTER_COLS; + j = k % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { struct swap_cluster_info *ci; idx = i * SWAP_CLUSTER_COLS + j; @@ -3493,18 +3388,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (si->bdev && bdev_nonrot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; - - cluster_info = setup_clusters(si, swap_header, maxpages); - if (IS_ERR(cluster_info)) { - error = PTR_ERR(cluster_info); - cluster_info = NULL; - goto bad_swap_unlock_inode; - } } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } + cluster_info = setup_clusters(si, swap_header, maxpages); + if (IS_ERR(cluster_info)) { + error = PTR_ERR(cluster_info); + cluster_info = NULL; + goto bad_swap_unlock_inode; + } + if ((swap_flags & SWAP_FLAG_DISCARD) && si->bdev && bdev_max_discard_sectors(si->bdev)) { /* @@ -3585,8 +3480,8 @@ bad_swap_unlock_inode: bad_swap: free_percpu(si->percpu_cluster); si->percpu_cluster = NULL; - free_percpu(si->cluster_next_cpu); - si->cluster_next_cpu = NULL; + kfree(si->global_cluster); + si->global_cluster = NULL; inode = NULL; destroy_swap_extents(si); swap_cgroup_swapoff(si->type); @@ -3623,7 +3518,7 @@ void si_swapinfo(struct sysinfo *val) struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) - nr_to_be_unused += READ_ONCE(si->inuse_pages); + nr_to_be_unused += swap_usage_in_pages(si); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; @@ -3651,11 +3546,15 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) int err, i; si = swp_swap_info(entry); + if (WARN_ON_ONCE(!si)) { + pr_err("%s%08lx\n", Bad_file, entry.val); + return -EINVAL; + } offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); err = 0; for (i = 0; i < nr; i++) { @@ -3710,7 +3609,7 @@ static int __swap_duplicate(swp_entry_t 
entry, unsigned char usage, int nr) } unlock_out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return err; } @@ -3819,7 +3718,6 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) */ goto outer; } - spin_lock(&si->lock); offset = swp_offset(entry); @@ -3884,7 +3782,6 @@ out_unlock_cont: spin_unlock(&si->cont_lock); out: unlock_cluster(ci); - spin_unlock(&si->lock); put_swap_device(si); outer: if (page) diff --git a/mm/truncate.c b/mm/truncate.c index 7c304d2f0052..76d8fcd89bd0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -525,6 +525,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, } EXPORT_SYMBOL(invalidate_mapping_pages); +static int folio_launder(struct address_space *mapping, struct folio *folio) +{ + if (!folio_test_dirty(folio)) + return 0; + if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) + return 0; + return mapping->a_ops->launder_folio(folio); +} + /* * This is like mapping_evict_folio(), except it ignores the folio's * refcount. We do this because invalidate_inode_pages2() needs stronger @@ -532,14 +541,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages); * shrink_folio_list() has a temp ref on them, or because they're transiently * sitting in the folio_add_lru() caches. */ -static int invalidate_complete_folio2(struct address_space *mapping, - struct folio *folio) +int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, + gfp_t gfp) { - if (folio->mapping != mapping) - return 0; + int ret; - if (!filemap_release_folio(folio, GFP_KERNEL)) - return 0; + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (folio_mapped(folio)) + unmap_mapping_folio(folio); + BUG_ON(folio_mapped(folio)); + + ret = folio_launder(mapping, folio); + if (ret) + return ret; + if (folio->mapping != mapping) + return -EBUSY; + if (!filemap_release_folio(folio, gfp)) + return -EBUSY; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); @@ -558,16 +577,7 @@ static int invalidate_complete_folio2(struct address_space *mapping, failed: xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); - return 0; -} - -static int folio_launder(struct address_space *mapping, struct folio *folio) -{ - if (!folio_test_dirty(folio)) - return 0; - if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) - return 0; - return mapping->a_ops->launder_folio(folio); + return -EBUSY; } /** @@ -631,16 +641,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); - - if (folio_mapped(folio)) - unmap_mapping_folio(folio); - BUG_ON(folio_mapped(folio)); - - ret2 = folio_launder(mapping, folio); - if (ret2 == 0) { - if (!invalidate_complete_folio2(mapping, folio)) - ret2 = -EBUSY; - } + ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL); if (ret2 < 0) ret = ret2; folio_unlock(folio); diff --git a/mm/usercopy.c b/mm/usercopy.c index 83c164aba6e0..dbdcc43964fb 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -17,7 +17,7 @@ #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> -#include <linux/thread_info.h> +#include <linux/ucopysize.h> #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/jump_label.h> @@ -201,7 +201,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n, } } -static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); +DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, + 
validate_usercopy_range); +EXPORT_SYMBOL(validate_usercopy_range); /* * Validates that the given object is: @@ -212,9 +214,6 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) { - if (static_branch_unlikely(&bypass_usercopy_checks)) - return; - /* Skip all tests if size is zero. */ if (!n) return; @@ -255,7 +254,8 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) } EXPORT_SYMBOL(__check_object_size); -static bool enable_checks __initdata = true; +static bool enable_checks __initdata = + IS_ENABLED(CONFIG_HARDENED_USERCOPY_DEFAULT_ON); static int __init parse_hardened_usercopy(char *str) { @@ -269,8 +269,10 @@ __setup("hardened_usercopy=", parse_hardened_usercopy); static int __init set_hardened_usercopy(void) { - if (enable_checks == false) - static_branch_enable(&bypass_usercopy_checks); + if (enable_checks) + static_branch_enable(&validate_usercopy_range); + else + static_branch_disable(&validate_usercopy_range); return 1; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 60a0be33766f..d06453fa8aba 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -18,6 +18,7 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> #include "internal.h" +#include "swap.h" static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) @@ -1020,6 +1021,14 @@ void double_pt_unlock(spinlock_t *ptl1, __release(ptl2); } +static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval) +{ + return pte_same(ptep_get(src_pte), orig_src_pte) && + pte_same(ptep_get(dst_pte), orig_dst_pte) && + pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); +} static int move_present_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, @@ -1027,6 +1036,7 @@ static int move_present_pte(struct mm_struct *mm, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, struct folio *src_folio) { @@ -1034,8 +1044,8 @@ static int move_present_pte(struct mm_struct *mm, double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(ptep_get(src_pte), orig_src_pte) || - !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { err = -EAGAIN; goto out; } @@ -1067,23 +1077,32 @@ out: return err; } -static int move_swap_pte(struct mm_struct *mm, +static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, - spinlock_t *dst_ptl, spinlock_t *src_ptl) + pmd_t *dst_pmd, pmd_t dst_pmdval, + spinlock_t *dst_ptl, spinlock_t *src_ptl, + struct folio *src_folio) { - if (!pte_swp_exclusive(orig_src_pte)) - return -EBUSY; - double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(ptep_get(src_pte), orig_src_pte) || - !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } + /* + * The src_folio resides in the swapcache, requiring an update to its + * index and mapping to align with the dst_vma, where a swap-in may + * occur and hit the swapcache after moving the PTE. 
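
is_pte_pages_stable() and its callers above follow an optimistic double-check: sample orig_src_pte/orig_dst_pte (and now dst_pmdval) without the page-table locks, take both locks, and re-verify nothing moved before committing, returning -EAGAIN so the caller retries otherwise. A generic user-space sketch of that shape, with invented names and plain longs standing in for PTE/PMD values:

	#include <pthread.h>
	#include <errno.h>

	struct slot {
		pthread_mutex_t lock;
		long entry;		/* stands in for a pte/pmd value */
	};

	/* Take both locks in a fixed order to avoid ABBA deadlocks. */
	static void double_lock(struct slot *a, struct slot *b)
	{
		if (a < b) {
			pthread_mutex_lock(&a->lock);
			pthread_mutex_lock(&b->lock);
		} else {
			pthread_mutex_lock(&b->lock);
			pthread_mutex_lock(&a->lock);
		}
	}

	static void double_unlock(struct slot *a, struct slot *b)
	{
		pthread_mutex_unlock(&a->lock);
		pthread_mutex_unlock(&b->lock);
	}

	/*
	 * Move src's entry into dst, but only if both still hold the values
	 * sampled locklessly (orig_src/orig_dst); otherwise ask the caller to
	 * re-sample and retry, like the -EAGAIN paths in move_pages_pte().
	 */
	static int move_entry(struct slot *dst, struct slot *src,
			      long orig_dst, long orig_src)
	{
		int err = 0;

		double_lock(dst, src);
		if (src->entry != orig_src || dst->entry != orig_dst) {
			err = -EAGAIN;
		} else {
			dst->entry = orig_src;
			src->entry = 0;
		}
		double_unlock(dst, src);
		return err;
	}
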
+ */ + if (src_folio) { + folio_move_anon_rmap(src_folio, dst_vma); + src_folio->index = linear_page_index(dst_vma, dst_addr); + } + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); @@ -1097,13 +1116,14 @@ static int move_zeropage_pte(struct mm_struct *mm, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl) { pte_t zero_pte; double_pt_lock(dst_ptl, src_ptl); - if (!pte_same(ptep_get(src_pte), orig_src_pte) || - !pte_same(ptep_get(dst_pte), orig_dst_pte)) { + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } @@ -1130,12 +1150,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, __u64 mode) { swp_entry_t entry; + struct swap_info_struct *si = NULL; pte_t orig_src_pte, orig_dst_pte; pte_t src_folio_pte; spinlock_t *src_ptl, *dst_ptl; pte_t *src_pte = NULL; pte_t *dst_pte = NULL; pmd_t dummy_pmdval; + pmd_t dst_pmdval; struct folio *src_folio = NULL; struct anon_vma *src_anon_vma = NULL; struct mmu_notifier_range range; @@ -1148,11 +1170,11 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, retry: /* * Use the maywrite version to indicate that dst_pte will be modified, - * but since we will use pte_same() to detect the change of the pte - * entry, there is no need to get pmdval, so just pass a dummy variable - * to it. + * since dst_pte needs to be none, the subsequent pte_same() check + * cannot prevent the dst_pte page from being freed concurrently, so we + * also need to abtain dst_pmdval and recheck pmd_same() later. */ - dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dummy_pmdval, + dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, &dst_ptl); /* Retry if a huge pmd materialized from under us */ @@ -1161,7 +1183,11 @@ retry: goto out; } - /* same as dst_pte */ + /* + * Unlike dst_pte, the subsequent pte_same() check can ensure the + * stability of the src_pte page, so there is no need to get pmdval, + * just pass a dummy variable to it. + */ src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, &src_ptl); @@ -1177,8 +1203,8 @@ retry: } /* Sanity checks before the operation */ - if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) || - WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) { + if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || + pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { err = -EINVAL; goto out; } @@ -1213,7 +1239,7 @@ retry: err = move_zeropage_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, - dst_ptl, src_ptl); + dst_pmd, dst_pmdval, dst_ptl, src_ptl); goto out; } @@ -1224,6 +1250,7 @@ retry: */ if (!src_folio) { struct folio *folio; + bool locked; /* * Pin the page while holding the lock to be sure the @@ -1243,14 +1270,28 @@ retry: goto out; } + locked = folio_trylock(folio); + /* + * We avoid waiting for folio lock with a raised + * refcount for large folios because extra refcounts + * will result in split_folio() failing later and + * retrying. If multiple tasks are trying to move a + * large folio we can end up livelocking. 
+ */ + if (!locked && folio_test_large(folio)) { + spin_unlock(src_ptl); + err = -EAGAIN; + goto out; + } + folio_get(folio); src_folio = folio; src_folio_pte = orig_src_pte; spin_unlock(src_ptl); - if (!folio_trylock(src_folio)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + if (!locked) { + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ folio_lock(src_folio); @@ -1266,8 +1307,8 @@ retry: /* at this point we have src_folio locked */ if (folio_test_large(src_folio)) { /* split_folio() can block */ - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; err = split_folio(src_folio); if (err) @@ -1292,8 +1333,8 @@ retry: goto out; } if (!anon_vma_trylock_write(src_anon_vma)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ anon_vma_lock_write(src_anon_vma); @@ -1303,14 +1344,16 @@ retry: err = move_present_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, - orig_dst_pte, orig_src_pte, - dst_ptl, src_ptl, src_folio); + orig_dst_pte, orig_src_pte, dst_pmd, + dst_pmdval, dst_ptl, src_ptl, src_folio); } else { + struct folio *folio = NULL; + entry = pte_to_swp_entry(orig_src_pte); if (non_swap_entry(entry)) { if (is_migration_entry(entry)) { - pte_unmap(&orig_src_pte); - pte_unmap(&orig_dst_pte); + pte_unmap(src_pte); + pte_unmap(dst_pte); src_pte = dst_pte = NULL; migration_entry_wait(mm, src_pmd, src_addr); err = -EAGAIN; @@ -1319,10 +1362,53 @@ retry: goto out; } - err = move_swap_pte(mm, dst_addr, src_addr, - dst_pte, src_pte, - orig_dst_pte, orig_src_pte, - dst_ptl, src_ptl); + if (!pte_swp_exclusive(orig_src_pte)) { + err = -EBUSY; + goto out; + } + + si = get_swap_device(entry); + if (unlikely(!si)) { + err = -EAGAIN; + goto out; + } + /* + * Verify the existence of the swapcache. If present, the folio's + * index and mapping must be updated even when the PTE is a swap + * entry. The anon_vma lock is not taken during this process since + * the folio has already been unmapped, and the swap entry is + * exclusive, preventing rmap walks. + * + * For large folios, return -EBUSY immediately, as split_folio() + * also returns -EBUSY when attempting to split unmapped large + * folios in the swapcache. This issue needs to be resolved + * separately to allow proper handling. 
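
The folio_trylock() and anon_vma_trylock_write() paths in this function share another idiom: try the sleeping lock while the PTL is held, and on failure drop everything, take the lock the blocking way, and restart, except for large folios, where the patch now fails with -EAGAIN to avoid the livelock described above. A condensed, purely illustrative user-space rendering (fast/slow/move_with_retry are invented names; the real code also keeps the folio locked across the retry):

	#include <pthread.h>
	#include <errno.h>
	#include <stdbool.h>

	static pthread_mutex_t fast = PTHREAD_MUTEX_INITIALIZER;	/* "ptl": never sleep under it */
	static pthread_mutex_t slow = PTHREAD_MUTEX_INITIALIZER;	/* "folio lock": may block */

	static int do_move_locked(void) { /* ... the real work ... */ return 0; }

	static int move_with_retry(bool large)
	{
		int err;

	retry:
		pthread_mutex_lock(&fast);
		if (pthread_mutex_trylock(&slow) != 0) {
			pthread_mutex_unlock(&fast);
			if (large)
				return -EAGAIN;	/* back off instead of livelocking */
			/* Block only after dropping the fast lock, then start over. */
			pthread_mutex_lock(&slow);
			pthread_mutex_unlock(&slow);
			goto retry;
		}
		err = do_move_locked();
		pthread_mutex_unlock(&slow);
		pthread_mutex_unlock(&fast);
		return err;
	}
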
+ */ + if (!src_folio) + folio = filemap_get_folio(swap_address_space(entry), + swap_cache_index(entry)); + if (!IS_ERR_OR_NULL(folio)) { + if (folio_test_large(folio)) { + err = -EBUSY; + folio_put(folio); + goto out; + } + src_folio = folio; + src_folio_pte = orig_src_pte; + if (!folio_trylock(src_folio)) { + pte_unmap(src_pte); + pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + put_swap_device(si); + si = NULL; + /* now we can block and wait */ + folio_lock(src_folio); + goto retry; + } + } + err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, + dst_ptl, src_ptl, src_folio); } out: @@ -1339,6 +1425,8 @@ out: if (src_pte) pte_unmap(src_pte); mmu_notifier_invalidate_range_end(&range); + if (si) + put_swap_device(si); return err; } diff --git a/mm/util.c b/mm/util.c index 60aa40f612b8..e7d81371032b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -23,6 +23,7 @@ #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> +#include <linux/fsnotify.h> #include <linux/uaccess.h> @@ -569,6 +570,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); + if (!ret) + ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; @@ -582,6 +585,23 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, return ret; } +/* + * Perform a userland memory mapping into the current process address space. See + * the comment for do_mmap() for more details on this operation in general. + * + * This differs from do_mmap() in that: + * + * a. An offset parameter is provided rather than pgoff, which is both checked + * for overflow and page alignment. + * b. mmap locking is performed on the caller's behalf. + * c. Userfaultfd unmap events and memory population are handled. + * + * This means that this function performs essentially the same work as if + * userland were invoking mmap (2). + * + * Returns either an error, or the address at which the requested mapping has + * been performed. + */ unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) @@ -595,168 +615,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); -static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) -{ - /* - * We want to attempt a large physically contiguous block first because - * it is less likely to fragment multiple larger blocks and therefore - * contribute to a long term fragmentation less than vmalloc fallback. - * However make sure that larger requests are not too disruptive - no - * OOM killer and no allocation failure warnings as we have a fallback. - */ - if (size > PAGE_SIZE) { - flags |= __GFP_NOWARN; - - if (!(flags & __GFP_RETRY_MAYFAIL)) - flags |= __GFP_NORETRY; - - /* nofail semantic is implemented by the vmalloc fallback */ - flags &= ~__GFP_NOFAIL; - } - - return flags; -} - -/** - * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon - * failure, fall back to non-contiguous (vmalloc) allocation. - * @size: size of the request. - * @b: which set of kmalloc buckets to allocate from. - * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. - * @node: numa node to allocate from - * - * Uses kmalloc to get the memory but if the allocation fails then falls back - * to the vmalloc allocator. 
Use kvfree for freeing the memory. - * - * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. - * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is - * preferable to the vmalloc fallback, due to visible performance drawbacks. - * - * Return: pointer to the allocated memory of %NULL in case of failure - */ -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) -{ - void *ret; - - /* - * It doesn't really make sense to fallback to vmalloc for sub page - * requests - */ - ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), - kmalloc_gfp_adjust(flags, size), - node); - if (ret || size <= PAGE_SIZE) - return ret; - - /* non-sleeping allocations are not supported by vmalloc */ - if (!gfpflags_allow_blocking(flags)) - return NULL; - - /* Don't even allow crazy sizes */ - if (unlikely(size > INT_MAX)) { - WARN_ON_ONCE(!(flags & __GFP_NOWARN)); - return NULL; - } - - /* - * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, - * since the callers already cannot assume anything - * about the resulting pointer, and cannot play - * protection games. - */ - return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - node, __builtin_return_address(0)); -} -EXPORT_SYMBOL(__kvmalloc_node_noprof); - -/** - * kvfree() - Free memory. - * @addr: Pointer to allocated memory. - * - * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). - * It is slightly more efficient to use kfree() or vfree() if you are certain - * that you know which one to use. - * - * Context: Either preemptible task context or not-NMI interrupt. - */ -void kvfree(const void *addr) -{ - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); -} -EXPORT_SYMBOL(kvfree); - -/** - * kvfree_sensitive - Free a data object containing sensitive information. - * @addr: address of the data object to be freed. - * @len: length of the data object. - * - * Use the special memzero_explicit() function to clear the content of a - * kvmalloc'ed object containing sensitive data to make sure that the - * compiler won't optimize out the data clearing. - */ -void kvfree_sensitive(const void *addr, size_t len) -{ - if (likely(!ZERO_OR_NULL_PTR(addr))) { - memzero_explicit((void *)addr, len); - kvfree(addr); - } -} -EXPORT_SYMBOL(kvfree_sensitive); - -/** - * kvrealloc - reallocate memory; contents remain unchanged - * @p: object to reallocate memory for - * @size: the size to reallocate - * @flags: the flags for the page level allocator - * - * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 - * and @p is not a %NULL pointer, the object pointed to is freed. - * - * If __GFP_ZERO logic is requested, callers must ensure that, starting with the - * initial memory allocation, every subsequent call to this API for the same - * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that - * __GFP_ZERO is not fully honored by this API. - * - * In any case, the contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. - * - * This function must not be called concurrently with itself or kvfree() for the - * same memory allocation. 
- * - * Return: pointer to the allocated memory or %NULL in case of error - */ -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) -{ - void *n; - - if (is_vmalloc_addr(p)) - return vrealloc_noprof(p, size, flags); - - n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); - if (!n) { - /* We failed to krealloc(), fall back to kvmalloc(). */ - n = kvmalloc_noprof(size, flags); - if (!n) - return NULL; - - if (p) { - /* We already know that `p` is not a vmalloc address. */ - kasan_disable_current(); - memcpy(n, kasan_reset_tag(p), ksize(p)); - kasan_enable_current(); - - kfree(p); - } - } - - return n; -} -EXPORT_SYMBOL(kvrealloc_noprof); - /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. @@ -203,6 +203,38 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, } /* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_lock and by + * the root anon_vma's mutex. + */ +static void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +static void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + +/* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct */ @@ -398,7 +430,6 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; - lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, @@ -415,8 +446,9 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the original VMA. */ -static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static __must_check int +__split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; @@ -511,38 +543,6 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, } /* - * vma has some anon_vma assigned, and is already inserted on that - * anon_vma's interval trees. - * - * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the - * vma must be removed from the anon_vma's interval trees using - * anon_vma_interval_tree_pre_update_vma(). - * - * After the update, the vma will be reinserted using - * anon_vma_interval_tree_post_update_vma(). - * - * The entire update must be protected by exclusive mmap_lock and by - * the root anon_vma's mutex. 
- */ -void -anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc; - - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); -} - -void -anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc; - - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); -} - -/* * dup_anon_vma() - Helper function to duplicate anon_vma * @dst: The destination VMA * @src: The source VMA @@ -710,7 +710,8 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end). */ -static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg) +static __must_check struct vm_area_struct *vma_merge_existing_range( + struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->vma; struct vm_area_struct *prev = vmg->prev; @@ -728,19 +729,20 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct * bool expanded; mmap_assert_write_locked(vmg->mm); - VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */ - VM_WARN_ON(vmg->next); /* We set this. */ - VM_WARN_ON(prev && start <= prev->vm_start); - VM_WARN_ON(start >= end); + VM_WARN_ON_VMG(!vma, vmg); /* We are modifying a VMA, so caller must specify. */ + VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */ + VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg); + VM_WARN_ON_VMG(start >= end, vmg); + /* * If vma == prev, then we are offset into a VMA. Otherwise, if we are * not, we must span a portion of the VMA. */ - VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) || - vmg->end > vma->vm_end)); + VM_WARN_ON_VMG(vma && ((vma != prev && vmg->start != vma->vm_start) || + vmg->end > vma->vm_end), vmg); /* The vmi must be positioned within vmg->vma. */ - VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start && - vma_iter_addr(vmg->vmi) < vma->vm_end)); + VM_WARN_ON_VMG(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start && + vma_iter_addr(vmg->vmi) < vma->vm_end), vmg); vmg->state = VMA_MERGE_NOMERGE; @@ -857,9 +859,9 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct * pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); - VM_WARN_ON(!merge_right); + VM_WARN_ON_VMG(!merge_right, vmg); /* If we are offset into a VMA, then prev must be vma. */ - VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev); + VM_WARN_ON_VMG(vmg->start > vma->vm_start && prev && vma != prev, vmg); if (merge_will_delete_vma) { vmg->vma = next; @@ -971,9 +973,9 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND; mmap_assert_write_locked(vmg->mm); - VM_WARN_ON(vmg->vma); + VM_WARN_ON_VMG(vmg->vma, vmg); /* vmi must point at or before the gap. */ - VM_WARN_ON(vma_iter_addr(vmg->vmi) > end); + VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); vmg->state = VMA_MERGE_NOMERGE; @@ -1055,7 +1057,7 @@ int vma_expand(struct vma_merge_struct *vmg) remove_next = true; /* This should already have been checked by this point. 
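
The warnings above switch from VM_WARN_ON() to VM_WARN_ON_VMG(), whose value is that a tripped check also dumps the vma_merge_struct being validated. The macro itself is defined elsewhere in the series; purely to illustrate the shape of such a context-carrying assertion, a user-space stand-in (WARN_ON_CTX and struct merge_ctx are invented names) could look like:

	#include <stdio.h>

	struct merge_ctx {			/* stands in for struct vma_merge_struct */
		unsigned long start, end;
		unsigned long flags;
	};

	static void dump_merge_ctx(const struct merge_ctx *ctx)
	{
		fprintf(stderr, "merge_ctx: start=%#lx end=%#lx flags=%#lx\n",
			ctx->start, ctx->end, ctx->flags);
	}

	/* Warn and dump the context that failed the check. */
	#define WARN_ON_CTX(cond, ctx)						\
		do {								\
			if (cond) {						\
				fprintf(stderr, "%s:%d: warning: %s\n",		\
					__FILE__, __LINE__, #cond);		\
				dump_merge_ctx(ctx);				\
			}							\
		} while (0)
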
*/ - VM_WARN_ON(!can_merge_remove_vma(next)); + VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); vma_start_write(next); ret = dup_anon_vma(vma, next, &anon_dup); if (ret) @@ -1063,10 +1065,10 @@ int vma_expand(struct vma_merge_struct *vmg) } /* Not merging but overwriting any part of next is not handled. */ - VM_WARN_ON(next && !remove_next && - next != vma && vmg->end > next->vm_start); + VM_WARN_ON_VMG(next && !remove_next && + next != vma && vmg->end > next->vm_start, vmg); /* Only handles expanding */ - VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end); + VM_WARN_ON_VMG(vma->vm_start < vmg->start || vma->vm_end > vmg->end, vmg); if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true)) goto nomem; @@ -1130,7 +1132,6 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms, * were isolated before we downgraded mmap_lock. */ mas_set(mas_detach, 1); - lru_add_drain(); tlb_gather_mmu(&tlb, vms->vma->vm_mm); update_hiwater_rss(vms->vma->vm_mm); unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, @@ -1508,24 +1509,28 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->vma; + unsigned long start = vmg->start; + unsigned long end = vmg->end; struct vm_area_struct *merged; /* First, try to merge. */ merged = vma_merge_existing_range(vmg); if (merged) return merged; + if (vmg_nomem(vmg)) + return ERR_PTR(-ENOMEM); /* Split any preceding portion of the VMA. */ - if (vma->vm_start < vmg->start) { - int err = split_vma(vmg->vmi, vma, vmg->start, 1); + if (vma->vm_start < start) { + int err = split_vma(vmg->vmi, vma, start, 1); if (err) return ERR_PTR(err); } /* Split any trailing portion of the VMA. */ - if (vma->vm_end > vmg->end) { - int err = split_vma(vmg->vmi, vma, vmg->end, 0); + if (vma->vm_end > end) { + int err = split_vma(vmg->vmi, vma, end, 0); if (err) return ERR_PTR(err); @@ -2376,7 +2381,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. */ - khugepaged_enter_vma(vma, map->flags); + if (!vma_is_anonymous(vma)) + khugepaged_enter_vma(vma, map->flags); ksm_add_vma(vma); *vmap = vma; return 0; @@ -2430,7 +2436,7 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } -unsigned long __mmap_region(struct file *file, unsigned long addr, +static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { @@ -2481,3 +2487,476 @@ abort_munmap: vms_abort_munmap_vmas(&map.vms, &map.mas_detach); return error; } + +/** + * mmap_region() - Actually perform the userland mapping of a VMA into + * current->mm with known, aligned and overflow-checked @addr and @len, and + * correctly determined VMA flags @vm_flags and page offset @pgoff. + * + * This is an internal memory management function, and should not be used + * directly. + * + * The caller must write-lock current->mm->mmap_lock. + * + * @file: If a file-backed mapping, a pointer to the struct file describing the + * file to be mapped, otherwise NULL. + * @addr: The page-aligned address at which to perform the mapping. + * @len: The page-aligned, non-zero, length of the mapping. + * @vm_flags: The VMA flags which should be applied to the mapping. 
+ * @pgoff: If @file is specified, the page offset into the file, if not then + * the virtual page offset in memory of the anonymous mapping. + * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap + * events. + * + * Returns: Either an error, or the address at which the requested mapping has + * been performed. + */ +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + unsigned long ret; + bool writable_file_mapping = false; + + mmap_assert_write_locked(current->mm); + + /* Check to see if MDWE is applicable. */ + if (map_deny_write_exec(vm_flags, vm_flags)) + return -EACCES; + + /* Allow architectures to sanity-check the vm_flags. */ + if (!arch_validate_flags(vm_flags)) + return -EINVAL; + + /* Map writable and ensure this isn't a sealed memfd. */ + if (file && is_shared_maywrite(vm_flags)) { + int error = mapping_map_writable(file->f_mapping); + + if (error) + return error; + writable_file_mapping = true; + } + + ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); + + /* Clear our write mapping regardless of error. */ + if (writable_file_mapping) + mapping_unmap_writable(file->f_mapping); + + validate_mm(current->mm); + return ret; +} + +/* + * do_brk_flags() - Increase the brk vma if the flags match. + * @vmi: The vma iterator + * @addr: The start address + * @len: The length of the increase + * @vma: The vma, + * @flags: The VMA Flags + * + * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags + * do not match then create a new anonymous VMA. Eventually we may be able to + * do some brk-specific accounting here. + */ +int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + + /* + * Check against address space limits by the changed size + * Note: This happens *after* clearing old mappings in some code paths. + */ + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) + return -ENOMEM; + + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) + return -ENOMEM; + + /* + * Expand the existing vma if possible; Note that singular lists do not + * occur after forking, so the expand will only happen on new VMAs. + */ + if (vma && vma->vm_end == addr) { + VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); + + vmg.prev = vma; + /* vmi is positioned at prev, which this mode expects. 
*/ + vmg.merge_flags = VMG_FLAG_JUST_EXPAND; + + if (vma_merge_new_range(&vmg)) + goto out; + else if (vmg_nomem(&vmg)) + goto unacct_fail; + } + + if (vma) + vma_iter_next_range(vmi); + /* create a vma struct for an anonymous mapping */ + vma = vm_area_alloc(mm); + if (!vma) + goto unacct_fail; + + vma_set_anonymous(vma); + vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); + vm_flags_init(vma, flags); + vma->vm_page_prot = vm_get_page_prot(flags); + vma_start_write(vma); + if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) + goto mas_store_fail; + + mm->map_count++; + validate_mm(mm); + ksm_add_vma(vma); +out: + perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; + mm->data_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) + mm->locked_vm += (len >> PAGE_SHIFT); + vm_flags_set(vma, VM_SOFTDIRTY); + return 0; + +mas_store_fail: + vm_area_free(vma); +unacct_fail: + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; +} + +/** + * unmapped_area() - Find an area between the low_limit and the high_limit with + * the correct alignment and offset, all from @info. Note: current->mm is used + * for the search. + * + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM. + */ +unsigned long unmapped_area(struct vm_unmapped_area_info *info) +{ + unsigned long length, gap; + unsigned long low_limit, high_limit; + struct vm_area_struct *tmp; + VMA_ITERATOR(vmi, current->mm, 0); + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask + info->start_gap; + if (length < info->length) + return -ENOMEM; + + low_limit = info->low_limit; + if (low_limit < mmap_min_addr) + low_limit = mmap_min_addr; + high_limit = info->high_limit; +retry: + if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) + return -ENOMEM; + + /* + * Adjust for the gap first so it doesn't interfere with the + * later alignment. The first step is the minimum needed to + * fulfill the start gap, the next step is the minimum to align + * that. It is the minimum needed to fulfill both. + */ + gap = vma_iter_addr(&vmi) + info->start_gap; + gap += (info->align_offset - gap) & info->align_mask; + tmp = vma_next(&vmi); + if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) < gap + length - 1) { + low_limit = tmp->vm_end; + vma_iter_reset(&vmi); + goto retry; + } + } else { + tmp = vma_prev(&vmi); + if (tmp && vm_end_gap(tmp) > gap) { + low_limit = vm_end_gap(tmp); + vma_iter_reset(&vmi); + goto retry; + } + } + + return gap; +} + +/** + * unmapped_area_topdown() - Find an area between the low_limit and the + * high_limit with the correct alignment and offset at the highest available + * address, all from @info. Note: current->mm is used for the search. + * + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM.
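As an aside on the vm_unmapped_area_info interface consumed by unmapped_area() and unmapped_area_topdown(): the fields referenced above (length, low_limit, high_limit, align_mask, align_offset, start_gap) are typically filled by an architecture's get_unmapped_area path. The snippet below is an editorial sketch, not part of this patch; example_find_lowest_gap() is hypothetical and the limits and alignment values are only illustrative of a plain bottom-up search.

static unsigned long example_find_lowest_gap(unsigned long len)
{
	struct vm_unmapped_area_info info = {};

	info.length = len;			 /* size of the gap wanted */
	info.low_limit = current->mm->mmap_base; /* lowest acceptable address */
	info.high_limit = TASK_SIZE;		 /* upper bound of the search */
	info.align_mask = 0;			 /* no special alignment */
	info.align_offset = 0;
	info.start_gap = 0;			 /* no guard gap before the area */

	return unmapped_area(&info);		 /* address or -ENOMEM */
}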
+ */ +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) +{ + unsigned long length, gap, gap_end; + unsigned long low_limit, high_limit; + struct vm_area_struct *tmp; + VMA_ITERATOR(vmi, current->mm, 0); + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask + info->start_gap; + if (length < info->length) + return -ENOMEM; + + low_limit = info->low_limit; + if (low_limit < mmap_min_addr) + low_limit = mmap_min_addr; + high_limit = info->high_limit; +retry: + if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) + return -ENOMEM; + + gap = vma_iter_end(&vmi) - info->length; + gap -= (gap - info->align_offset) & info->align_mask; + gap_end = vma_iter_end(&vmi); + tmp = vma_next(&vmi); + if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) < gap_end) { + high_limit = vm_start_gap(tmp); + vma_iter_reset(&vmi); + goto retry; + } + } else { + tmp = vma_prev(&vmi); + if (tmp && vm_end_gap(tmp) > gap) { + high_limit = tmp->vm_start; + vma_iter_reset(&vmi); + goto retry; + } + } + + return gap; +} + +/* + * Verify that the stack growth is acceptable and + * update accounting. This is shared with both the + * grow-up and grow-down cases. + */ +static int acct_stack_growth(struct vm_area_struct *vma, + unsigned long size, unsigned long grow) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long new_start; + + /* address space limit tests */ + if (!may_expand_vm(mm, vma->vm_flags, grow)) + return -ENOMEM; + + /* Stack limit test */ + if (size > rlimit(RLIMIT_STACK)) + return -ENOMEM; + + /* mlock limit tests */ + if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) + return -ENOMEM; + + /* Check to ensure the stack will not grow into a hugetlb-only region */ + new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : + vma->vm_end - size; + if (is_hugepage_only_range(vma->vm_mm, new_start, size)) + return -EFAULT; + + /* + * Overcommit.. This must be the final test, as it will + * update security statistics. + */ + if (security_vm_enough_memory_mm(mm, grow)) + return -ENOMEM; + + return 0; +} + +#if defined(CONFIG_STACK_GROWSUP) +/* + * PA-RISC uses this for its stack. + * vma is the last one with address > vma->vm_end. Have to extend vma. + */ +int expand_upwards(struct vm_area_struct *vma, unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; + unsigned long gap_addr; + int error = 0; + VMA_ITERATOR(vmi, mm, vma->vm_start); + + if (!(vma->vm_flags & VM_GROWSUP)) + return -EFAULT; + + mmap_assert_write_locked(mm); + + /* Guard against exceeding limits of the address space. */ + address &= PAGE_MASK; + if (address >= (TASK_SIZE & PAGE_MASK)) + return -ENOMEM; + address += PAGE_SIZE; + + /* Enforce stack_guard_gap */ + gap_addr = address + stack_guard_gap; + + /* Guard against overflow */ + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { + if (!(next->vm_flags & VM_GROWSUP)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } + + if (next) + vma_iter_prev_range_limit(&vmi, address); + + vma_iter_config(&vmi, vma->vm_start, address); + if (vma_iter_prealloc(&vmi, vma)) + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. 
*/ + if (unlikely(anon_vma_prepare(vma))) { + vma_iter_free(&vmi); + return -ENOMEM; + } + + /* Lock the VMA before expanding to prevent concurrent page faults */ + vma_start_write(vma); + /* We update the anon VMA tree. */ + anon_vma_lock_write(vma->anon_vma); + + /* Somebody else might have raced and expanded it already */ + if (address > vma->vm_end) { + unsigned long size, grow; + + size = address - vma->vm_start; + grow = (address - vma->vm_end) >> PAGE_SHIFT; + + error = -ENOMEM; + if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, grow); + anon_vma_interval_tree_pre_update_vma(vma); + vma->vm_end = address; + /* Overwrite old entry in mtree. */ + vma_iter_store(&vmi, vma); + anon_vma_interval_tree_post_update_vma(vma); + + perf_event_mmap(vma); + } + } + } + anon_vma_unlock_write(vma->anon_vma); + vma_iter_free(&vmi); + validate_mm(mm); + return error; +} +#endif /* CONFIG_STACK_GROWSUP */ + +/* + * vma is the first one with address < vma->vm_start. Have to extend vma. + * mmap_lock held for writing. + */ +int expand_downwards(struct vm_area_struct *vma, unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *prev; + int error = 0; + VMA_ITERATOR(vmi, mm, vma->vm_start); + + if (!(vma->vm_flags & VM_GROWSDOWN)) + return -EFAULT; + + mmap_assert_write_locked(mm); + + address &= PAGE_MASK; + if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; + + /* Enforce stack_guard_gap */ + prev = vma_prev(&vmi); + /* Check that both stack segments have the same anon_vma? */ + if (prev) { + if (!(prev->vm_flags & VM_GROWSDOWN) && + vma_is_accessible(prev) && + (address - prev->vm_end < stack_guard_gap)) + return -ENOMEM; + } + + if (prev) + vma_iter_next_range_limit(&vmi, vma->vm_start); + + vma_iter_config(&vmi, address, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) { + vma_iter_free(&vmi); + return -ENOMEM; + } + + /* Lock the VMA before expanding to prevent concurrent page faults */ + vma_start_write(vma); + /* We update the anon VMA tree. */ + anon_vma_lock_write(vma->anon_vma); + + /* Somebody else might have raced and expanded it already */ + if (address < vma->vm_start) { + unsigned long size, grow; + + size = vma->vm_end - address; + grow = (vma->vm_start - address) >> PAGE_SHIFT; + + error = -ENOMEM; + if (grow <= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, grow); + anon_vma_interval_tree_pre_update_vma(vma); + vma->vm_start = address; + vma->vm_pgoff -= grow; + /* Overwrite old entry in mtree. 
*/ + vma_iter_store(&vmi, vma); + anon_vma_interval_tree_post_update_vma(vma); + + perf_event_mmap(vma); + } + } + } + anon_vma_unlock_write(vma->anon_vma); + vma_iter_free(&vmi); + validate_mm(mm); + return error; +} + +int __vm_munmap(unsigned long start, size_t len, bool unlock) +{ + int ret; + struct mm_struct *mm = current->mm; + LIST_HEAD(uf); + VMA_ITERATOR(vmi, mm, start); + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); + if (ret || !unlock) + mmap_write_unlock(mm); + + userfaultfd_unmap_complete(mm, &uf); + return ret; +} @@ -139,15 +139,10 @@ void validate_mm(struct mm_struct *mm); #define validate_mm(mm) do { } while (0) #endif -/* Required for expand_downwards(). */ -void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); - -/* Required for expand_downwards(). */ -void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); - -int vma_expand(struct vma_merge_struct *vmg); -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff); +__must_check int vma_expand(struct vma_merge_struct *vmg); +__must_check int vma_shrink(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff); static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) @@ -180,13 +175,14 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); /* We are about to modify the VMA's flags. */ -struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, +__must_check struct vm_area_struct +*vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags); /* We are about to modify the VMA's flags and/or anon_name. */ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -196,7 +192,7 @@ struct vm_area_struct struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. */ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -204,7 +200,7 @@ struct vm_area_struct struct mempolicy *new_pol); /* We are about to modify the VMA's flags and/or uffd context. 
*/ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -212,11 +208,13 @@ struct vm_area_struct unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx); -struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); +__must_check struct vm_area_struct +*vma_merge_new_range(struct vma_merge_struct *vmg); -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, - struct vm_area_struct *vma, - unsigned long delta); +__must_check struct vm_area_struct +*vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); @@ -243,10 +241,16 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); int mm_take_all_locks(struct mm_struct *mm); void mm_drop_all_locks(struct mm_struct *mm); -unsigned long __mmap_region(struct file *file, unsigned long addr, +unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); +int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, + unsigned long addr, unsigned long request, unsigned long flags); + +unsigned long unmapped_area(struct vm_unmapped_area_info *info); +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); + static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* @@ -472,4 +476,12 @@ static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) #endif +#if defined(CONFIG_STACK_GROWSUP) +int expand_upwards(struct vm_area_struct *vma, unsigned long address); +#endif + +int expand_downwards(struct vm_area_struct *vma, unsigned long address); + +int __vm_munmap(unsigned long start, size_t len, bool unlock); + #endif /* __MM_VMA_H */ diff --git a/mm/vma_internal.h b/mm/vma_internal.h index fc5f172a36bd..2f05735ff190 100644 --- a/mm/vma_internal.h +++ b/mm/vma_internal.h @@ -35,6 +35,7 @@ #include <linux/mutex.h> #include <linux/pagemap.h> #include <linux/perf_event.h> +#include <linux/personality.h> #include <linux/pfn.h> #include <linux/rcupdate.h> #include <linux/rmap.h> diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5c88d0e90c20..61981ee1c9d2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -586,13 +586,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, mask |= PGTBL_PGD_MODIFIED; err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) - return err; + break; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, end); - return 0; + return err; } /* @@ -3562,11 +3562,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolicy wants to alloc memory by interleaving. 
*/ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy_noprof(gfp, + nr = alloc_pages_bulk_mempolicy_noprof(gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node_noprof(gfp, nid, + nr = alloc_pages_bulk_node_noprof(gfp, nid, nr_pages_request, pages + nr_allocated); diff --git a/mm/vmscan.c b/mm/vmscan.c index b1ec5ece067e..c767d71c43d7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -862,6 +862,31 @@ enum folio_references { FOLIOREF_ACTIVATE, }; +#ifdef CONFIG_LRU_GEN +/* + * Only used on a mapped folio in the eviction (rmap walk) path, where promotion + * needs to be done by taking the folio off the LRU list and then adding it back + * with PG_active set. In contrast, the aging (page table walk) path uses + * folio_update_gen(). + */ +static bool lru_gen_set_refs(struct folio *folio) +{ + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + return false; + } + + set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset)); + return true; +} +#else +static bool lru_gen_set_refs(struct folio *folio) +{ + return false; +} +#endif /* CONFIG_LRU_GEN */ + static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { @@ -870,7 +895,6 @@ static enum folio_references folio_check_references(struct folio *folio, referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); - referenced_folio = folio_test_clear_referenced(folio); /* * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. @@ -888,6 +912,15 @@ static enum folio_references folio_check_references(struct folio *folio, if (referenced_ptes == -1) return FOLIOREF_KEEP; + if (lru_gen_enabled()) { + if (!referenced_ptes) + return FOLIOREF_RECLAIM; + + return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP; + } + + referenced_folio = folio_test_clear_referenced(folio); + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -1053,7 +1086,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, struct folio_batch free_folios; LIST_HEAD(ret_folios); LIST_HEAD(demote_folios); - unsigned int nr_reclaimed = 0; + unsigned int nr_reclaimed = 0, nr_demoted = 0; unsigned int pgactivate = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; @@ -1092,11 +1125,6 @@ retry: if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; - /* folio_update_gen() tried to promote this page? */ - if (lru_gen_enabled() && !ignore_references && - folio_mapped(folio) && folio_test_referenced(folio)) - goto keep_locked; - /* * The number of dirty pages determines if a node is marked * reclaim_congested. 
kswapd will stall and start writing @@ -1522,8 +1550,9 @@ keep: /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ - stat->nr_demoted = demote_folio_list(&demote_folios, pgdat); - nr_reclaimed += stat->nr_demoted; + nr_demoted = demote_folio_list(&demote_folios, pgdat); + nr_reclaimed += nr_demoted; + stat->nr_demoted += nr_demoted; /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { /* Folios which weren't demoted go back on @folio_list */ @@ -1664,6 +1693,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; unsigned long skipped = 0; unsigned long scan, total_scan, nr_pages; + unsigned long max_nr_skipped = 0; LIST_HEAD(folios_skipped); total_scan = 0; @@ -1678,9 +1708,12 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, nr_pages = folio_nr_pages(folio); total_scan += nr_pages; - if (folio_zonenum(folio) > sc->reclaim_idx) { + /* Using max_nr_skipped to prevent hard LOCKUP*/ + if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED && + (folio_zonenum(folio) > sc->reclaim_idx)) { nr_skipped[folio_zonenum(folio)] += nr_pages; move_to = &folios_skipped; + max_nr_skipped++; goto move; } @@ -2619,11 +2652,17 @@ static bool should_clear_pmd_young(void) READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ } +#define evictable_min_seq(min_seq, swappiness) \ + min((min_seq)[!(swappiness)], (min_seq)[(swappiness) <= MAX_SWAPPINESS]) + #define for_each_gen_type_zone(gen, type, zone) \ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) +#define for_each_evictable_type(type, swappiness) \ + for ((type) = !(swappiness); (type) <= ((swappiness) <= MAX_SWAPPINESS); (type)++) + #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) @@ -2669,10 +2708,16 @@ static int get_nr_gens(struct lruvec *lruvec, int type) static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) { - /* see the comment on lru_gen_folio */ - return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && - get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && - get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + int n = get_nr_gens(lruvec, type); + + if (n < MIN_NR_GENS || n > MAX_NR_GENS) + return false; + } + + return true; } /****************************************************************************** @@ -3073,16 +3118,20 @@ struct ctrl_pos { static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { + int i; struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); - pos->refaulted = lrugen->avg_refaulted[type][tier] + - atomic_long_read(&lrugen->refaulted[hist][type][tier]); - pos->total = lrugen->avg_total[type][tier] + - atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - pos->total += lrugen->protected[hist][type][tier - 1]; pos->gain = gain; + pos->refaulted = pos->total = 0; + + for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) { + pos->refaulted += lrugen->avg_refaulted[type][i] + + atomic_long_read(&lrugen->refaulted[hist][type][i]); + pos->total += lrugen->avg_total[type][i] + + lrugen->protected[hist][type][i] + + atomic_long_read(&lrugen->evicted[hist][type][i]); + } } static void reset_ctrl_pos(struct lruvec 
*lruvec, int type, bool carryover) @@ -3108,17 +3157,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); sum = lrugen->avg_total[type][tier] + + lrugen->protected[hist][type][tier] + atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - sum += lrugen->protected[hist][type][tier - 1]; WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); } if (clear) { atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); atomic_long_set(&lrugen->evicted[hist][type][tier], 0); - if (tier) - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + WRITE_ONCE(lrugen->protected[hist][type][tier], 0); } } } @@ -3145,16 +3192,19 @@ static int folio_update_gen(struct folio *folio, int gen) VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + return -1; + } + do { /* lru_gen_del_folio() has isolated this page? */ - if (!(old_flags & LRU_GEN_MASK)) { - /* for shrink_folio_list() */ - new_flags = old_flags | BIT(PG_referenced); - continue; - } + if (!(old_flags & LRU_GEN_MASK)) + return -1; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); - new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); + new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; @@ -3178,7 +3228,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai new_gen = (old_gen + 1) % MAX_NR_GENS; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; /* for folio_end_writeback() */ if (reclaiming) @@ -3253,7 +3303,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (vma_is_anonymous(vma)) - return !walk->can_swap; + return !walk->swappiness; if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) return true; @@ -3263,7 +3313,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (shmem_mapping(mapping)) - return !walk->can_swap; + return !walk->swappiness; + + if (walk->swappiness > MAX_SWAPPINESS) + return true; /* to exclude special mappings like dax, etc. 
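As an aside on the new swappiness encoding used throughout these hunks: swappiness now doubles as a mode selector, and the cases below are an editorial summary (not text from the patch) derived from the for_each_evictable_type() and evictable_min_seq() definitions above together with the should_skip_vma() check, assuming LRU_GEN_ANON is type 0 and LRU_GEN_FILE is type 1.

/*
 * swappiness == 0:
 *	for_each_evictable_type() visits only LRU_GEN_FILE, and
 *	evictable_min_seq() evaluates to min_seq[LRU_GEN_FILE];
 *	anon is not scanned at all.
 * 1 <= swappiness <= MAX_SWAPPINESS:
 *	both LRU_GEN_ANON and LRU_GEN_FILE are visited, and
 *	evictable_min_seq() is the smaller of the two min_seq values.
 * swappiness == MAX_SWAPPINESS + 1:
 *	anon-only mode: only LRU_GEN_ANON is visited, file-backed VMAs are
 *	skipped by should_skip_vma(), and evictable_min_seq() is
 *	min_seq[LRU_GEN_ANON].
 */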
*/ return !mapping->a_ops->read_folio; @@ -3351,19 +3404,17 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned } static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, - struct pglist_data *pgdat, bool can_swap) + struct pglist_data *pgdat) { - struct folio *folio; + struct folio *folio = pfn_folio(pfn); - folio = pfn_folio(pfn); - if (folio_nid(folio) != pgdat->node_id) + if (folio_lru_gen(folio) < 0) return NULL; - if (folio_memcg(folio) != memcg) + if (folio_nid(folio) != pgdat->node_id) return NULL; - /* file VMAs can contain anon pages from COW */ - if (!folio_is_file_lru(folio) && !can_swap) + if (folio_memcg(folio) != memcg) return NULL; return folio; @@ -3377,29 +3428,55 @@ static bool suitable_to_scan(int total, int young) return young * n >= total; } +static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio, + int new_gen, bool dirty) +{ + int old_gen; + + if (!folio) + return; + + if (dirty && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + if (walk) { + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + } else if (lru_gen_set_refs(folio)) { + old_gen = folio_lru_gen(folio); + if (old_gen >= 0 && old_gen != new_gen) + folio_activate(folio); + } +} + static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *args) { int i; + bool dirty; pte_t *pte; spinlock_t *ptl; unsigned long addr; int total = 0; int young = 0; + struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); pmd_t pmdval; - pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, - &ptl); + pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); if (!pte) return false; + if (!spin_trylock(ptl)) { pte_unmap(pte); - return false; + return true; } if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { @@ -3421,26 +3498,30 @@ restart: if (pfn == -1) continue; - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; if (!ptep_clear_young_notify(args->vma, addr, pte + i)) continue; - young++; - walk->mm_stats[MM_LEAF_YOUNG]++; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); + + last = folio; + dirty = false; + } - if (pte_dirty(ptent) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); + if (pte_dirty(ptent)) + dirty = true; - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); + young++; + walk->mm_stats[MM_LEAF_YOUNG]++; } + walk_update_folio(walk, last, gen, dirty); + last = NULL; + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; @@ -3454,13 +3535,15 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; + bool dirty; pmd_t *pmd; spinlock_t *ptl; + struct folio *last = NULL; struct lru_gen_mm_walk *walk = 
args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -3506,27 +3589,30 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if (pfn == -1) goto next; - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) goto next; if (!pmdp_clear_young_notify(vma, addr, pmd + i)) goto next; - walk->mm_stats[MM_LEAF_YOUNG]++; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); - if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); + last = folio; + dirty = false; + } - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); + if (pmd_dirty(pmd[i])) + dirty = true; + + walk->mm_stats[MM_LEAF_YOUNG]++; next: i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; } while (i <= MIN_LRU_BATCH); + walk_update_folio(walk, last, gen, dirty); + arch_leave_lazy_mmu_mode(); spin_unlock(ptl); done: @@ -3718,22 +3804,25 @@ static void clear_mm_walk(void) kfree(walk); } -static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) +static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) { int zone; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - if (type == LRU_GEN_ANON && !can_swap) + if (type ? 
swappiness > MAX_SWAPPINESS : !swappiness) goto done; - /* prevent cold/hot inversion if force_scan is true */ + /* prevent cold/hot inversion if the type is evictable */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { struct list_head *head = &lrugen->folios[old_gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); + int refs = folio_lru_refs(folio); + bool workingset = folio_test_workingset(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); @@ -3743,6 +3832,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) new_gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int tier = lru_tier_from_refs(refs, workingset); + int delta = folio_nr_pages(folio); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } + if (!--remaining) return false; } @@ -3754,7 +3852,7 @@ done: return true; } -static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; bool success = false; @@ -3764,7 +3862,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); /* find the oldest populated generation */ - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { gen = lru_gen_from_seq(min_seq[type]); @@ -3780,13 +3878,17 @@ next: } /* see the comment on lru_gen_folio */ - if (can_swap) { - min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); - min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); + if (swappiness && swappiness <= MAX_SWAPPINESS) { + unsigned long seq = lrugen->max_seq - MIN_NR_GENS; + + if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq) + min_seq[LRU_GEN_ANON] = seq; + else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq) + min_seq[LRU_GEN_FILE] = seq; } - for (type = !can_swap; type < ANON_AND_FILE; type++) { - if (min_seq[type] == lrugen->min_seq[type]) + for_each_evictable_type(type, swappiness) { + if (min_seq[type] <= lrugen->min_seq[type]) continue; reset_ctrl_pos(lruvec, type, true); @@ -3797,8 +3899,7 @@ next: return success; } -static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) +static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness) { bool success; int prev, next; @@ -3816,13 +3917,11 @@ restart: if (!success) goto unlock; - for (type = ANON_AND_FILE - 1; type >= 0; type--) { + for (type = 0; type < ANON_AND_FILE; type++) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) continue; - VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); - - if (inc_min_seq(lruvec, type, can_swap)) + if (inc_min_seq(lruvec, type, swappiness)) continue; spin_unlock_irq(&lruvec->lru_lock); @@ -3866,7 +3965,7 @@ unlock: } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) + int swappiness, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; @@ -3877,7 +3976,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, VM_WARN_ON_ONCE(seq > 
READ_ONCE(lrugen->max_seq)); if (!mm_state) - return inc_max_seq(lruvec, seq, can_swap, force_scan); + return inc_max_seq(lruvec, seq, swappiness); /* see the comment in iterate_mm_list() */ if (seq <= READ_ONCE(mm_state->seq)) @@ -3902,7 +4001,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, walk->lruvec = lruvec; walk->seq = seq; - walk->can_swap = can_swap; + walk->swappiness = swappiness; walk->force_scan = force_scan; do { @@ -3912,7 +4011,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, } while (mm); done: if (success) { - success = inc_max_seq(lruvec, seq, can_swap, force_scan); + success = inc_max_seq(lruvec, seq, swappiness); WARN_ON_ONCE(!success); } @@ -3953,13 +4052,13 @@ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; unsigned long total = 0; - bool can_swap = get_swappiness(lruvec, sc); + int swappiness = get_swappiness(lruvec, sc); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { @@ -3979,6 +4078,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc { int gen; unsigned long birth; + int swappiness = get_swappiness(lruvec, sc); struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); @@ -3988,8 +4088,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc if (!lruvec_is_sizable(lruvec, sc)) return false; - /* see the comment on lru_gen_folio */ - gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness)); birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); return time_is_before_jiffies(birth + min_ttl); @@ -4048,21 +4147,22 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; + bool dirty; unsigned long start; unsigned long end; struct lru_gen_mm_walk *walk; + struct folio *last = NULL; int young = 1; pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); - bool can_swap = !folio_is_file_lru(folio); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); DEFINE_MAX_SEQ(lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); @@ -4109,37 +4209,28 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (pfn == -1) continue; - folio = get_pfn_folio(pfn, memcg, pgdat, can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; if (!ptep_clear_young_notify(vma, addr, pte + i)) continue; - young++; - - if (pte_dirty(ptent) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); - - if (walk) { - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); 
- continue; + last = folio; + dirty = false; } - old_gen = folio_lru_gen(folio); - if (old_gen < 0) - folio_set_referenced(folio); - else if (old_gen != new_gen) { - folio_clear_lru_refs(folio); - folio_activate(folio); - } + if (pte_dirty(ptent)) + dirty = true; + + young++; } + walk_update_folio(walk, last, gen, dirty); + arch_leave_lazy_mmu_mode(); /* feedback from rmap walkers to page table walkers */ @@ -4297,7 +4388,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); @@ -4319,14 +4411,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* protected */ - if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) { - int hist = lru_hist_from_seq(lrugen->min_seq[type]); - + if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) { gen = folio_inc_gen(lruvec, folio, false); - list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], - lrugen->protected[hist][type][tier - 1] + delta); + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } return true; } @@ -4346,8 +4441,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* waiting for writeback */ - if (folio_test_locked(folio) || writeback || - (type == LRU_GEN_FILE && dirty)) { + if (writeback || (type == LRU_GEN_FILE && dirty)) { gen = folio_inc_gen(lruvec, folio, true); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; @@ -4376,13 +4470,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca return false; } - /* see the comment on MAX_NR_TIERS */ + /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) - folio_clear_lru_refs(folio); + set_mask_bits(&folio->flags, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); - folio_clear_referenced(folio); success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); @@ -4478,13 +4571,13 @@ static int get_tier_idx(struct lruvec *lruvec, int type) struct ctrl_pos sp, pv; /* - * To leave a margin for fluctuations, use a larger gain factor (1:2). + * To leave a margin for fluctuations, use a larger gain factor (2:3). * This value is chosen because any other tier would have at least twice * as many refaults as the first tier. 
*/ - read_ctrl_pos(lruvec, type, 0, 1, &sp); + read_ctrl_pos(lruvec, type, 0, 2, &sp); for (tier = 1; tier < MAX_NR_TIERS; tier++) { - read_ctrl_pos(lruvec, type, tier, 2, &pv); + read_ctrl_pos(lruvec, type, tier, 3, &pv); if (!positive_ctrl_err(&sp, &pv)) break; } @@ -4492,79 +4585,45 @@ static int get_tier_idx(struct lruvec *lruvec, int type) return tier - 1; } -static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +static int get_type_to_scan(struct lruvec *lruvec, int swappiness) { - int type, tier; struct ctrl_pos sp, pv; - int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness }; + if (swappiness <= MIN_SWAPPINESS + 1) + return LRU_GEN_FILE; + + if (swappiness >= MAX_SWAPPINESS) + return LRU_GEN_ANON; /* - * Compare the first tier of anon with that of file to determine which - * type to scan. Also need to compare other tiers of the selected type - * with the first tier of the other type to determine the last tier (of - * the selected type) to evict. + * Compare the sum of all tiers of anon with that of file to determine + * which type to scan. */ - read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); - read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); - type = positive_ctrl_err(&sp, &pv); - - read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); - for (tier = 1; tier < MAX_NR_TIERS; tier++) { - read_ctrl_pos(lruvec, type, tier, gain[type], &pv); - if (!positive_ctrl_err(&sp, &pv)) - break; - } - - *tier_idx = tier - 1; + read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp); + read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv); - return type; + return positive_ctrl_err(&sp, &pv); } static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { int i; - int type; - int scanned; - int tier = -1; - DEFINE_MIN_SEQ(lruvec); + int type = get_type_to_scan(lruvec, swappiness); - /* - * Try to make the obvious choice first, and if anon and file are both - * available from the same generation, - * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon - * first. - * 2. If !__GFP_IO, file first since clean pagecache is more likely to - * exist than clean swapcache. 
- */ - if (!swappiness) - type = LRU_GEN_FILE; - else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) - type = LRU_GEN_ANON; - else if (swappiness == 1) - type = LRU_GEN_FILE; - else if (swappiness == MAX_SWAPPINESS) - type = LRU_GEN_ANON; - else if (!(sc->gfp_mask & __GFP_IO)) - type = LRU_GEN_FILE; - else - type = get_type_to_scan(lruvec, swappiness, &tier); + for_each_evictable_type(i, swappiness) { + int scanned; + int tier = get_tier_idx(lruvec, type); - for (i = !swappiness; i < ANON_AND_FILE; i++) { - if (tier < 0) - tier = get_tier_idx(lruvec, type); + *type_scanned = type; scanned = scan_folios(lruvec, sc, type, tier, list); if (scanned) - break; + return scanned; type = !type; - tier = -1; } - *type_scanned = type; - - return scanned; + return 0; } static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) @@ -4580,6 +4639,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap struct reclaim_stat stat; struct lru_gen_mm_walk *walk; bool skip_retry = false; + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -4589,7 +4649,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap scanned += try_to_inc_min_seq(lruvec, swappiness); - if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) + if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) scanned = 0; spin_unlock_irq(&lruvec->lru_lock); @@ -4605,31 +4665,24 @@ retry: type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { + DEFINE_MIN_SEQ(lruvec); + if (!folio_evictable(folio)) { list_del(&folio->lru); folio_putback_lru(folio); continue; } - if (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio))) { - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ - if (folio_test_workingset(folio)) - folio_set_referenced(folio); - continue; - } - - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || - folio_mapped(folio) || folio_test_locked(folio) || - folio_test_dirty(folio) || folio_test_writeback(folio)) { - /* don't add rejected folios to the oldest generation */ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, - BIT(PG_active)); + /* retry folios that may have missed folio_rotate_reclaimable() */ + if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) && + !folio_test_dirty(folio) && !folio_test_writeback(folio)) { + list_move(&folio->lru, &clean); continue; } - /* retry folios that may have missed folio_rotate_reclaimable() */ - list_move(&folio->lru, &clean); + /* don't add rejected folios to the oldest generation */ + if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) + set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); @@ -4664,63 +4717,32 @@ retry: } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - bool can_swap, unsigned long *nr_to_scan) + int swappiness, unsigned long *nr_to_scan) { int gen, type, zone; - unsigned long old = 0; - unsigned long young = 0; - unsigned long total = 0; + unsigned long size = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); - /* whether this lruvec is completely out of cold folios */ - if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { - *nr_to_scan = 0; + *nr_to_scan = 0; + /* have to run aging, since eviction is not possible 
anymore */ + if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq) return true; - } - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { - unsigned long size = 0; - gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - - total += size; - if (seq == max_seq) - young += size; - else if (seq + MIN_NR_GENS == max_seq) - old += size; } } - *nr_to_scan = total; - - /* - * The aging tries to be lazy to reduce the overhead, while the eviction - * stalls when the number of generations reaches MIN_NR_GENS. Hence, the - * ideal number of generations is MIN_NR_GENS+1. - */ - if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) - return false; - - /* - * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) - * of the total number of pages for each generation. A reasonable range - * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The - * aging cares about the upper bound of hot pages, while the eviction - * cares about the lower bound of cold pages. - */ - if (young * MIN_NR_GENS > total) - return true; - if (old * (MIN_NR_GENS + 2) < total) - return true; - - return false; + *nr_to_scan = size; + /* better to run aging even though eviction is still possible */ + return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; } /* @@ -4728,7 +4750,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ -static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { bool success; unsigned long nr_to_scan; @@ -4738,7 +4760,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; - success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan); + success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); /* try to scrape all its memory if this memcg was deleted */ if (nr_to_scan && !mem_cgroup_online(memcg)) @@ -4749,7 +4771,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool return nr_to_scan >> sc->priority; /* stop scanning this lruvec as it's low on cold folios */ - return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0; + return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? 
-1 : 0; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) @@ -5293,8 +5315,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, s = "rep"; n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); + n[2] = READ_ONCE(lrugen->protected[hist][type][tier]); } for (i = 0; i < 3; i++) @@ -5349,7 +5370,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) seq_printf(m, " node %5d\n", nid); if (!full) - seq = min_seq[LRU_GEN_ANON]; + seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2); else if (max_seq >= MAX_NR_GENS) seq = max_seq - MAX_NR_GENS + 1; else @@ -5389,23 +5410,14 @@ static const struct seq_operations lru_gen_seq_ops = { }; static int run_aging(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) + int swappiness, bool force_scan) { DEFINE_MAX_SEQ(lruvec); - DEFINE_MIN_SEQ(lruvec); - - if (seq < max_seq) - return 0; if (seq > max_seq) return -EINVAL; - if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) - return -ERANGE; - - try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan); - - return 0; + return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST; } static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, @@ -5421,7 +5433,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co while (!signal_pending(current)) { DEFINE_MIN_SEQ(lruvec); - if (seq < min_seq[!swappiness]) + if (seq < evictable_min_seq(min_seq, swappiness)) return 0; if (sc->nr_reclaimed >= nr_to_reclaim) @@ -5466,7 +5478,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (swappiness < MIN_SWAPPINESS) swappiness = get_swappiness(lruvec, sc); - else if (swappiness > MAX_SWAPPINESS) + else if (swappiness > MAX_SWAPPINESS + 1) goto done; switch (cmd) { @@ -7192,10 +7204,6 @@ static int kswapd(void *p) unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(tsk, cpumask); /* * Tell the memory management that we're a "memory allocator", @@ -7364,13 +7372,15 @@ void __meminit kswapd_run(int nid) pgdat_kswapd_lock(pgdat); if (!pgdat->kswapd) { - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ pr_err("Failed to start kswapd on node %d,ret=%ld\n", nid, PTR_ERR(pgdat->kswapd)); BUG_ON(system_state < SYSTEM_RUNNING); pgdat->kswapd = NULL; + } else { + wake_up_process(pgdat->kswapd); } } pgdat_kswapd_unlock(pgdat); diff --git a/mm/vmstat.c b/mm/vmstat.c index 16bfe1c694dd..88998725f1c5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1435,6 +1435,8 @@ const char * const vmstat_text[] = { #ifdef CONFIG_X86 "direct_map_level2_splits", "direct_map_level3_splits", + "direct_map_level2_collapses", + "direct_map_level3_collapses", #endif #ifdef CONFIG_PER_VMA_LOCK_STATS "vma_lock_success", diff --git a/mm/workingset.c b/mm/workingset.c index a4705e196545..4841ae8af411 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -239,7 +239,8 @@ static void *lru_gen_eviction(struct folio *folio) int type = folio_is_file_lru(folio); int delta = 
folio_nr_pages(folio); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); @@ -253,18 +254,18 @@ static void *lru_gen_eviction(struct folio *folio) hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); - return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); } /* * Tests if the shadow entry is for a folio that was recently evicted. * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. */ -static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, +static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { int memcg_id; - unsigned long min_seq; + unsigned long max_seq; struct mem_cgroup *memcg; struct pglist_data *pgdat; @@ -273,8 +274,10 @@ static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, memcg = mem_cgroup_from_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); - min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]); - return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)); + max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); + max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH; + + return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; } static void lru_gen_refault(struct folio *folio, void *shadow) @@ -290,7 +293,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) rcu_read_lock(); - recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset); + recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset); if (lruvec != folio_lruvec(folio)) goto unlock; @@ -302,24 +305,20 @@ static void lru_gen_refault(struct folio *folio, void *shadow) lrugen = &lruvec->lrugen; hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); - /* see the comment in folio_lru_refs() */ - refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; - tier = lru_tier_from_refs(refs); + refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1; + tier = lru_tier_from_refs(refs, workingset); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); - mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); - /* - * Count the following two cases as stalls: - * 1. For pages accessed through page tables, hotter pages pushed out - * hot pages which refaulted immediately. - * 2. For pages accessed multiple times through file descriptors, - * they would have been protected by sort_folio(). 
- */ - if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) { - set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset)); + /* see folio_add_lru() where folio_set_active() will be called */ + if (lru_gen_in_fault()) + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); + + if (workingset) { + folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); - } + } else + set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } @@ -331,7 +330,7 @@ static void *lru_gen_eviction(struct folio *folio) return NULL; } -static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, +static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { return false; @@ -428,17 +427,16 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, struct pglist_data *pgdat; unsigned long eviction; - rcu_read_lock(); - if (lru_gen_enabled()) { - bool recent = lru_gen_test_recent(shadow, file, - &eviction_lruvec, &eviction, workingset); + bool recent; + rcu_read_lock(); + recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset); rcu_read_unlock(); return recent; } - + rcu_read_lock(); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; @@ -459,14 +457,12 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, * configurations instead. */ eviction_memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && - (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) { - rcu_read_unlock(); - return false; - } - + if (!mem_cgroup_tryget(eviction_memcg)) + eviction_memcg = NULL; rcu_read_unlock(); + if (!mem_cgroup_disabled() && !eviction_memcg) + return false; /* * Flush stats (and potentially sleep) outside the RCU read section. * @@ -544,6 +540,8 @@ void workingset_refault(struct folio *folio, void *shadow) bool workingset; long nr; + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + if (lru_gen_enabled()) { lru_gen_refault(folio, shadow); return; @@ -558,7 +556,6 @@ void workingset_refault(struct folio *folio, void *shadow) * is actually experiencing the refault event. Make sure the folio is * locked to guarantee folio_memcg() stability throughout. */ - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); diff --git a/mm/zpdesc.h b/mm/zpdesc.h new file mode 100644 index 000000000000..fa47fece2237 --- /dev/null +++ b/mm/zpdesc.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* zpdesc.h: zswap.zpool memory descriptor + * + * Written by Alex Shi <alexs@kernel.org> + * Hyeonggon Yoo <42.hyeyoo@gmail.com> + */ +#ifndef __MM_ZPDESC_H__ +#define __MM_ZPDESC_H__ + +/* + * struct zpdesc - Memory descriptor for zpool memory. + * @flags: Page flags, mostly unused by zsmalloc. + * @lru: Indirectly used by page migration. + * @movable_ops: Used by page migration. + * @next: Next zpdesc in a zspage in zsmalloc zpool. + * @handle: For huge zspage in zsmalloc zpool. + * @zspage: Points to the zspage this zpdesc is a part of. + * @first_obj_offset: First object offset in zsmalloc zpool. + * @_refcount: The number of references to this zpdesc. + * + * This struct overlays struct page for now. Do not modify without a good + * understanding of the issues. In particular, do not expand into the overlap + * with memcg_data. 
+ * + * Page flags used: + * * PG_private identifies the first component page. + * * PG_locked is used by page migration code. + */ +struct zpdesc { + unsigned long flags; + struct list_head lru; + unsigned long movable_ops; + union { + struct zpdesc *next; + unsigned long handle; + }; + struct zspage *zspage; + /* + * Only the lower 24 bits are available for offset, limiting a page + * to 16 MiB. The upper 8 bits are reserved for PGTY_zsmalloc. + * + * Do not access this field directly. + * Instead, use {get,set}_first_obj_offset() helpers. + */ + unsigned int first_obj_offset; + atomic_t _refcount; +}; +#define ZPDESC_MATCH(pg, zp) \ + static_assert(offsetof(struct page, pg) == offsetof(struct zpdesc, zp)) + +ZPDESC_MATCH(flags, flags); +ZPDESC_MATCH(lru, lru); +ZPDESC_MATCH(mapping, movable_ops); +ZPDESC_MATCH(index, next); +ZPDESC_MATCH(index, handle); +ZPDESC_MATCH(private, zspage); +ZPDESC_MATCH(page_type, first_obj_offset); +ZPDESC_MATCH(_refcount, _refcount); +#undef ZPDESC_MATCH +static_assert(sizeof(struct zpdesc) <= sizeof(struct page)); + +/* + * zpdesc_page - The first struct page allocated for a zpdesc + * @zp: The zpdesc. + * + * A convenience wrapper for converting zpdesc to the first struct page of the + * underlying folio, to communicate with code not yet converted to folio or + * struct zpdesc. + * + */ +#define zpdesc_page(zp) (_Generic((zp), \ + const struct zpdesc *: (const struct page *)(zp), \ + struct zpdesc *: (struct page *)(zp))) + +/** + * zpdesc_folio - The folio allocated for a zpdesc + * @zp: The zpdesc. + * + * Zpdescs are descriptors for zpool memory. The zpool memory itself is + * allocated as folios that contain the zpool objects, and zpdesc uses specific + * fields in the first struct page of the folio - those fields are now accessed + * by struct zpdesc. + * + * It is occasionally necessary convert to back to a folio in order to + * communicate with the rest of the mm. Please use this helper function + * instead of casting yourself, as the implementation may change in the future. + */ +#define zpdesc_folio(zp) (_Generic((zp), \ + const struct zpdesc *: (const struct folio *)(zp), \ + struct zpdesc *: (struct folio *)(zp))) +/** + * page_zpdesc - Converts from first struct page to zpdesc. + * @p: The first (either head of compound or single) page of zpdesc. + * + * A temporary wrapper to convert struct page to struct zpdesc in situations + * where we know the page is the compound head, or single order-0 page. + * + * Long-term ideally everything would work with struct zpdesc directly or go + * through folio to struct zpdesc. 
+ * + * Return: The zpdesc which contains this page + */ +#define page_zpdesc(p) (_Generic((p), \ + const struct page *: (const struct zpdesc *)(p), \ + struct page *: (struct zpdesc *)(p))) + +static inline void zpdesc_lock(struct zpdesc *zpdesc) +{ + folio_lock(zpdesc_folio(zpdesc)); +} + +static inline bool zpdesc_trylock(struct zpdesc *zpdesc) +{ + return folio_trylock(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_unlock(struct zpdesc *zpdesc) +{ + folio_unlock(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_wait_locked(struct zpdesc *zpdesc) +{ + folio_wait_locked(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_get(struct zpdesc *zpdesc) +{ + folio_get(zpdesc_folio(zpdesc)); +} + +static inline void zpdesc_put(struct zpdesc *zpdesc) +{ + folio_put(zpdesc_folio(zpdesc)); +} + +static inline void *kmap_local_zpdesc(struct zpdesc *zpdesc) +{ + return kmap_local_page(zpdesc_page(zpdesc)); +} + +static inline unsigned long zpdesc_pfn(struct zpdesc *zpdesc) +{ + return page_to_pfn(zpdesc_page(zpdesc)); +} + +static inline struct zpdesc *pfn_zpdesc(unsigned long pfn) +{ + return page_zpdesc(pfn_to_page(pfn)); +} + +static inline void __zpdesc_set_movable(struct zpdesc *zpdesc, + const struct movable_operations *mops) +{ + __SetPageMovable(zpdesc_page(zpdesc), mops); +} + +static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc) +{ + __SetPageZsmalloc(zpdesc_page(zpdesc)); +} + +static inline void __zpdesc_clear_zsmalloc(struct zpdesc *zpdesc) +{ + __ClearPageZsmalloc(zpdesc_page(zpdesc)); +} + +static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc) +{ + return PageIsolated(zpdesc_page(zpdesc)); +} + +static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc) +{ + return page_zone(zpdesc_page(zpdesc)); +} + +static inline bool zpdesc_is_locked(struct zpdesc *zpdesc) +{ + return folio_test_locked(zpdesc_folio(zpdesc)); +} +#endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 64b66a4d3e6e..6d0e47f7ae33 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -13,24 +13,6 @@ * Released under the terms of GNU General Public License Version 2.0 */ -/* - * Following is how we use various fields and flags of underlying - * struct page(s) to form a zspage. - * - * Usage of struct page fields: - * page->private: points to zspage - * page->index: links together all component pages of a zspage - * For the huge page, this is always 0, so we use this field - * to store handle. 
- * page->page_type: PGTY_zsmalloc, lower 24 bits locate the first object - * offset in a subpage of a zspage - * - * Usage of struct page flags: - * PG_private: identifies the first component page - * PG_owner_priv_1: identifies the huge component page - * - */ - #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* @@ -67,6 +49,7 @@ #include <linux/pagemap.h> #include <linux/fs.h> #include <linux/local_lock.h> +#include "zpdesc.h" #define ZSPAGE_MAGIC 0x58 @@ -245,6 +228,35 @@ struct zs_pool { atomic_t compaction_in_progress; }; +static inline void zpdesc_set_first(struct zpdesc *zpdesc) +{ + SetPagePrivate(zpdesc_page(zpdesc)); +} + +static inline void zpdesc_inc_zone_page_state(struct zpdesc *zpdesc) +{ + inc_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES); +} + +static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc) +{ + dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES); +} + +static inline struct zpdesc *alloc_zpdesc(gfp_t gfp) +{ + struct page *page = alloc_page(gfp); + + return page_zpdesc(page); +} + +static inline void free_zpdesc(struct zpdesc *zpdesc) +{ + struct page *page = zpdesc_page(zpdesc); + + __free_page(page); +} + struct zspage { struct { unsigned int huge:HUGE_BITS; @@ -254,7 +266,7 @@ struct zspage { }; unsigned int inuse; unsigned int freeobj; - struct page *first_page; + struct zpdesc *first_zpdesc; struct list_head list; /* fullness list */ struct zs_pool *pool; rwlock_t lock; @@ -440,9 +452,9 @@ static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { .lock = INIT_LOCAL_LOCK(lock), }; -static __maybe_unused int is_first_page(struct page *page) +static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc) { - return PagePrivate(page); + return PagePrivate(zpdesc_page(zpdesc)); } /* Protected by class->lock */ @@ -451,36 +463,35 @@ static inline int get_zspage_inuse(struct zspage *zspage) return zspage->inuse; } - static inline void mod_zspage_inuse(struct zspage *zspage, int val) { zspage->inuse += val; } -static inline struct page *get_first_page(struct zspage *zspage) +static struct zpdesc *get_first_zpdesc(struct zspage *zspage) { - struct page *first_page = zspage->first_page; + struct zpdesc *first_zpdesc = zspage->first_zpdesc; - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - return first_page; + VM_BUG_ON_PAGE(!is_first_zpdesc(first_zpdesc), zpdesc_page(first_zpdesc)); + return first_zpdesc; } #define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff -static inline unsigned int get_first_obj_offset(struct page *page) +static inline unsigned int get_first_obj_offset(struct zpdesc *zpdesc) { - VM_WARN_ON_ONCE(!PageZsmalloc(page)); - return page->page_type & FIRST_OBJ_PAGE_TYPE_MASK; + VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc))); + return zpdesc->first_obj_offset & FIRST_OBJ_PAGE_TYPE_MASK; } -static inline void set_first_obj_offset(struct page *page, unsigned int offset) +static inline void set_first_obj_offset(struct zpdesc *zpdesc, unsigned int offset) { /* With 24 bits available, we can support offsets into 16 MiB pages. 
*/ BUILD_BUG_ON(PAGE_SIZE > SZ_16M); - VM_WARN_ON_ONCE(!PageZsmalloc(page)); + VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc))); VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK); - page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK; - page->page_type |= offset & FIRST_OBJ_PAGE_TYPE_MASK; + zpdesc->first_obj_offset &= ~FIRST_OBJ_PAGE_TYPE_MASK; + zpdesc->first_obj_offset |= offset & FIRST_OBJ_PAGE_TYPE_MASK; } static inline unsigned int get_freeobj(struct zspage *zspage) @@ -733,52 +744,52 @@ out: return newfg; } -static struct zspage *get_zspage(struct page *page) +static struct zspage *get_zspage(struct zpdesc *zpdesc) { - struct zspage *zspage = (struct zspage *)page_private(page); + struct zspage *zspage = zpdesc->zspage; BUG_ON(zspage->magic != ZSPAGE_MAGIC); return zspage; } -static struct page *get_next_page(struct page *page) +static struct zpdesc *get_next_zpdesc(struct zpdesc *zpdesc) { - struct zspage *zspage = get_zspage(page); + struct zspage *zspage = get_zspage(zpdesc); if (unlikely(ZsHugePage(zspage))) return NULL; - return (struct page *)page->index; + return zpdesc->next; } /** - * obj_to_location - get (<page>, <obj_idx>) from encoded object value + * obj_to_location - get (<zpdesc>, <obj_idx>) from encoded object value * @obj: the encoded object value - * @page: page object resides in zspage + * @zpdesc: zpdesc object resides in zspage * @obj_idx: object index */ -static void obj_to_location(unsigned long obj, struct page **page, +static void obj_to_location(unsigned long obj, struct zpdesc **zpdesc, unsigned int *obj_idx) { - *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS); *obj_idx = (obj & OBJ_INDEX_MASK); } -static void obj_to_page(unsigned long obj, struct page **page) +static void obj_to_zpdesc(unsigned long obj, struct zpdesc **zpdesc) { - *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS); } /** - * location_to_obj - get obj value encoded from (<page>, <obj_idx>) - * @page: page object resides in zspage + * location_to_obj - get obj value encoded from (<zpdesc>, <obj_idx>) + * @zpdesc: zpdesc object resides in zspage * @obj_idx: object index */ -static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) +static unsigned long location_to_obj(struct zpdesc *zpdesc, unsigned int obj_idx) { unsigned long obj; - obj = page_to_pfn(page) << OBJ_INDEX_BITS; + obj = zpdesc_pfn(zpdesc) << OBJ_INDEX_BITS; obj |= obj_idx & OBJ_INDEX_MASK; return obj; @@ -789,15 +800,15 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static inline bool obj_allocated(struct page *page, void *obj, +static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj, unsigned long *phandle) { unsigned long handle; - struct zspage *zspage = get_zspage(page); + struct zspage *zspage = get_zspage(zpdesc); if (unlikely(ZsHugePage(zspage))) { - VM_BUG_ON_PAGE(!is_first_page(page), page); - handle = page->index; + VM_BUG_ON_PAGE(!is_first_zpdesc(zpdesc), zpdesc_page(zpdesc)); + handle = zpdesc->handle; } else handle = *(unsigned long *)obj; @@ -809,22 +820,24 @@ static inline bool obj_allocated(struct page *page, void *obj, return true; } -static void reset_page(struct page *page) +static void reset_zpdesc(struct zpdesc *zpdesc) { + struct page *page = zpdesc_page(zpdesc); + __ClearPageMovable(page); ClearPagePrivate(page); - set_page_private(page, 0); - page->index = 0; + zpdesc->zspage = NULL; + zpdesc->next = NULL; __ClearPageZsmalloc(page); } static int 
trylock_zspage(struct zspage *zspage) { - struct page *cursor, *fail; + struct zpdesc *cursor, *fail; - for (cursor = get_first_page(zspage); cursor != NULL; cursor = - get_next_page(cursor)) { - if (!trylock_page(cursor)) { + for (cursor = get_first_zpdesc(zspage); cursor != NULL; cursor = + get_next_zpdesc(cursor)) { + if (!zpdesc_trylock(cursor)) { fail = cursor; goto unlock; } @@ -832,9 +845,9 @@ static int trylock_zspage(struct zspage *zspage) return 1; unlock: - for (cursor = get_first_page(zspage); cursor != fail; cursor = - get_next_page(cursor)) - unlock_page(cursor); + for (cursor = get_first_zpdesc(zspage); cursor != fail; cursor = + get_next_zpdesc(cursor)) + zpdesc_unlock(cursor); return 0; } @@ -842,23 +855,23 @@ unlock: static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { - struct page *page, *next; + struct zpdesc *zpdesc, *next; assert_spin_locked(&class->lock); VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0); - next = page = get_first_page(zspage); + next = zpdesc = get_first_zpdesc(zspage); do { - VM_BUG_ON_PAGE(!PageLocked(page), page); - next = get_next_page(page); - reset_page(page); - unlock_page(page); - dec_zone_page_state(page, NR_ZSPAGES); - put_page(page); - page = next; - } while (page != NULL); + VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc)); + next = get_next_zpdesc(zpdesc); + reset_zpdesc(zpdesc); + zpdesc_unlock(zpdesc); + zpdesc_dec_zone_page_state(zpdesc); + zpdesc_put(zpdesc); + zpdesc = next; + } while (zpdesc != NULL); cache_free_zspage(pool, zspage); @@ -891,16 +904,16 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) { unsigned int freeobj = 1; unsigned long off = 0; - struct page *page = get_first_page(zspage); + struct zpdesc *zpdesc = get_first_zpdesc(zspage); - while (page) { - struct page *next_page; + while (zpdesc) { + struct zpdesc *next_zpdesc; struct link_free *link; void *vaddr; - set_first_obj_offset(page, off); + set_first_obj_offset(zpdesc, off); - vaddr = kmap_local_page(page); + vaddr = kmap_local_zpdesc(zpdesc); link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { @@ -913,8 +926,8 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) * page, which must point to the first object on the next * page (if present) */ - next_page = get_next_page(page); - if (next_page) { + next_zpdesc = get_next_zpdesc(zpdesc); + if (next_zpdesc) { link->next = freeobj++ << OBJ_TAG_BITS; } else { /* @@ -924,7 +937,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) link->next = -1UL << OBJ_TAG_BITS; } kunmap_local(vaddr); - page = next_page; + zpdesc = next_zpdesc; off %= PAGE_SIZE; } @@ -932,35 +945,35 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) } static void create_page_chain(struct size_class *class, struct zspage *zspage, - struct page *pages[]) + struct zpdesc *zpdescs[]) { int i; - struct page *page; - struct page *prev_page = NULL; - int nr_pages = class->pages_per_zspage; + struct zpdesc *zpdesc; + struct zpdesc *prev_zpdesc = NULL; + int nr_zpdescs = class->pages_per_zspage; /* * Allocate individual pages and link them together as: - * 1. all pages are linked together using page->index - * 2. each sub-page point to zspage using page->private + * 1. all pages are linked together using zpdesc->next + * 2. 
each sub-page point to zspage using zpdesc->zspage * - * we set PG_private to identify the first page (i.e. no other sub-page + * we set PG_private to identify the first zpdesc (i.e. no other zpdesc * has this flag set). */ - for (i = 0; i < nr_pages; i++) { - page = pages[i]; - set_page_private(page, (unsigned long)zspage); - page->index = 0; + for (i = 0; i < nr_zpdescs; i++) { + zpdesc = zpdescs[i]; + zpdesc->zspage = zspage; + zpdesc->next = NULL; if (i == 0) { - zspage->first_page = page; - SetPagePrivate(page); + zspage->first_zpdesc = zpdesc; + zpdesc_set_first(zpdesc); if (unlikely(class->objs_per_zspage == 1 && class->pages_per_zspage == 1)) SetZsHugePage(zspage); } else { - prev_page->index = (unsigned long)page; + prev_zpdesc->next = zpdesc; } - prev_page = page; + prev_zpdesc = zpdesc; } } @@ -972,7 +985,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, gfp_t gfp) { int i; - struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE]; + struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE]; struct zspage *zspage = cache_alloc_zspage(pool, gfp); if (!zspage) @@ -982,25 +995,25 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, migrate_lock_init(zspage); for (i = 0; i < class->pages_per_zspage; i++) { - struct page *page; + struct zpdesc *zpdesc; - page = alloc_page(gfp); - if (!page) { + zpdesc = alloc_zpdesc(gfp); + if (!zpdesc) { while (--i >= 0) { - dec_zone_page_state(pages[i], NR_ZSPAGES); - __ClearPageZsmalloc(pages[i]); - __free_page(pages[i]); + zpdesc_dec_zone_page_state(zpdescs[i]); + __zpdesc_clear_zsmalloc(zpdescs[i]); + free_zpdesc(zpdescs[i]); } cache_free_zspage(pool, zspage); return NULL; } - __SetPageZsmalloc(page); + __zpdesc_set_zsmalloc(zpdesc); - inc_zone_page_state(page, NR_ZSPAGES); - pages[i] = page; + zpdesc_inc_zone_page_state(zpdesc); + zpdescs[i] = zpdesc; } - create_page_chain(class, zspage, pages); + create_page_chain(class, zspage, zpdescs); init_zspage(class, zspage); zspage->pool = pool; zspage->class = class->index; @@ -1044,7 +1057,7 @@ static inline void __zs_cpu_down(struct mapping_area *area) } static void *__zs_map_object(struct mapping_area *area, - struct page *pages[2], int off, int size) + struct zpdesc *zpdescs[2], int off, int size) { size_t sizes[2]; char *buf = area->vm_buf; @@ -1060,14 +1073,14 @@ static void *__zs_map_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy object to per-cpu buffer */ - memcpy_from_page(buf, pages[0], off, sizes[0]); - memcpy_from_page(buf + sizes[0], pages[1], 0, sizes[1]); + memcpy_from_page(buf, zpdesc_page(zpdescs[0]), off, sizes[0]); + memcpy_from_page(buf + sizes[0], zpdesc_page(zpdescs[1]), 0, sizes[1]); out: return area->vm_buf; } static void __zs_unmap_object(struct mapping_area *area, - struct page *pages[2], int off, int size) + struct zpdesc *zpdescs[2], int off, int size) { size_t sizes[2]; char *buf; @@ -1085,8 +1098,8 @@ static void __zs_unmap_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy per-cpu buffer to object */ - memcpy_to_page(pages[0], off, buf, sizes[0]); - memcpy_to_page(pages[1], 0, buf + sizes[0], sizes[1]); + memcpy_to_page(zpdesc_page(zpdescs[0]), off, buf, sizes[0]); + memcpy_to_page(zpdesc_page(zpdescs[1]), 0, buf + sizes[0], sizes[1]); out: /* enable page faults to match kunmap_local() return conditions */ @@ -1176,13 +1189,13 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm) { struct zspage *zspage; - struct page *page; + struct zpdesc *zpdesc; unsigned long obj, off; unsigned int obj_idx; 
struct size_class *class; struct mapping_area *area; - struct page *pages[2]; + struct zpdesc *zpdescs[2]; void *ret; /* @@ -1195,8 +1208,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, /* It guarantees it can get zspage from handle safely */ read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); - obj_to_location(obj, &page, &obj_idx); - zspage = get_zspage(page); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); /* * migration cannot move any zpages in this zspage. Here, class->lock @@ -1215,17 +1228,17 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ - area->vm_addr = kmap_local_page(page); + area->vm_addr = kmap_local_zpdesc(zpdesc); ret = area->vm_addr + off; goto out; } /* this object spans two pages */ - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); + zpdescs[0] = zpdesc; + zpdescs[1] = get_next_zpdesc(zpdesc); + BUG_ON(!zpdescs[1]); - ret = __zs_map_object(area, pages, off, class->size); + ret = __zs_map_object(area, zpdescs, off, class->size); out: if (likely(!ZsHugePage(zspage))) ret += ZS_HANDLE_SIZE; @@ -1237,7 +1250,7 @@ EXPORT_SYMBOL_GPL(zs_map_object); void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { struct zspage *zspage; - struct page *page; + struct zpdesc *zpdesc; unsigned long obj, off; unsigned int obj_idx; @@ -1245,8 +1258,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) struct mapping_area *area; obj = handle_to_obj(handle); - obj_to_location(obj, &page, &obj_idx); - zspage = get_zspage(page); + obj_to_location(obj, &zpdesc, &obj_idx); + zspage = get_zspage(zpdesc); class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); @@ -1254,13 +1267,13 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) if (off + class->size <= PAGE_SIZE) kunmap_local(area->vm_addr); else { - struct page *pages[2]; + struct zpdesc *zpdescs[2]; - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); + zpdescs[0] = zpdesc; + zpdescs[1] = get_next_zpdesc(zpdesc); + BUG_ON(!zpdescs[1]); - __zs_unmap_object(area, pages, off, class->size); + __zs_unmap_object(area, zpdescs, off, class->size); } local_unlock(&zs_map_area.lock); @@ -1290,12 +1303,12 @@ EXPORT_SYMBOL_GPL(zs_huge_class_size); static unsigned long obj_malloc(struct zs_pool *pool, struct zspage *zspage, unsigned long handle) { - int i, nr_page, offset; + int i, nr_zpdesc, offset; unsigned long obj; struct link_free *link; struct size_class *class; - struct page *m_page; + struct zpdesc *m_zpdesc; unsigned long m_offset; void *vaddr; @@ -1303,27 +1316,26 @@ static unsigned long obj_malloc(struct zs_pool *pool, obj = get_freeobj(zspage); offset = obj * class->size; - nr_page = offset >> PAGE_SHIFT; + nr_zpdesc = offset >> PAGE_SHIFT; m_offset = offset_in_page(offset); - m_page = get_first_page(zspage); + m_zpdesc = get_first_zpdesc(zspage); - for (i = 0; i < nr_page; i++) - m_page = get_next_page(m_page); + for (i = 0; i < nr_zpdesc; i++) + m_zpdesc = get_next_zpdesc(m_zpdesc); - vaddr = kmap_local_page(m_page); + vaddr = kmap_local_zpdesc(m_zpdesc); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); if (likely(!ZsHugePage(zspage))) /* record handle in the header of allocated chunk */ link->handle = handle | OBJ_ALLOCATED_TAG; else - /* record handle to page->index */ - 
zspage->first_page->index = handle | OBJ_ALLOCATED_TAG; + zspage->first_zpdesc->handle = handle | OBJ_ALLOCATED_TAG; kunmap_local(vaddr); mod_zspage_inuse(zspage, 1); - obj = location_to_obj(m_page, obj); + obj = location_to_obj(m_zpdesc, obj); record_obj(handle, obj); return obj; @@ -1402,23 +1414,24 @@ static void obj_free(int class_size, unsigned long obj) { struct link_free *link; struct zspage *zspage; - struct page *f_page; + struct zpdesc *f_zpdesc; unsigned long f_offset; unsigned int f_objidx; void *vaddr; - obj_to_location(obj, &f_page, &f_objidx); + + obj_to_location(obj, &f_zpdesc, &f_objidx); f_offset = offset_in_page(class_size * f_objidx); - zspage = get_zspage(f_page); + zspage = get_zspage(f_zpdesc); - vaddr = kmap_local_page(f_page); + vaddr = kmap_local_zpdesc(f_zpdesc); link = (struct link_free *)(vaddr + f_offset); /* Insert this object in containing zspage's freelist */ if (likely(!ZsHugePage(zspage))) link->next = get_freeobj(zspage) << OBJ_TAG_BITS; else - f_page->index = 0; + f_zpdesc->handle = 0; set_freeobj(zspage, f_objidx); kunmap_local(vaddr); @@ -1428,7 +1441,7 @@ static void obj_free(int class_size, unsigned long obj) void zs_free(struct zs_pool *pool, unsigned long handle) { struct zspage *zspage; - struct page *f_page; + struct zpdesc *f_zpdesc; unsigned long obj; struct size_class *class; int fullness; @@ -1442,8 +1455,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle) */ read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); - obj_to_page(obj, &f_page); - zspage = get_zspage(f_page); + obj_to_zpdesc(obj, &f_zpdesc); + zspage = get_zspage(f_zpdesc); class = zspage_class(pool, zspage); spin_lock(&class->lock); read_unlock(&pool->migrate_lock); @@ -1463,7 +1476,7 @@ EXPORT_SYMBOL_GPL(zs_free); static void zs_object_copy(struct size_class *class, unsigned long dst, unsigned long src) { - struct page *s_page, *d_page; + struct zpdesc *s_zpdesc, *d_zpdesc; unsigned int s_objidx, d_objidx; unsigned long s_off, d_off; void *s_addr, *d_addr; @@ -1472,8 +1485,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, s_size = d_size = class->size; - obj_to_location(src, &s_page, &s_objidx); - obj_to_location(dst, &d_page, &d_objidx); + obj_to_location(src, &s_zpdesc, &s_objidx); + obj_to_location(dst, &d_zpdesc, &d_objidx); s_off = offset_in_page(class->size * s_objidx); d_off = offset_in_page(class->size * d_objidx); @@ -1484,8 +1497,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, if (d_off + class->size > PAGE_SIZE) d_size = PAGE_SIZE - d_off; - s_addr = kmap_local_page(s_page); - d_addr = kmap_local_page(d_page); + s_addr = kmap_local_zpdesc(s_zpdesc); + d_addr = kmap_local_zpdesc(d_zpdesc); while (1) { size = min(s_size, d_size); @@ -1510,17 +1523,17 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, if (s_off >= PAGE_SIZE) { kunmap_local(d_addr); kunmap_local(s_addr); - s_page = get_next_page(s_page); - s_addr = kmap_local_page(s_page); - d_addr = kmap_local_page(d_page); + s_zpdesc = get_next_zpdesc(s_zpdesc); + s_addr = kmap_local_zpdesc(s_zpdesc); + d_addr = kmap_local_zpdesc(d_zpdesc); s_size = class->size - written; s_off = 0; } if (d_off >= PAGE_SIZE) { kunmap_local(d_addr); - d_page = get_next_page(d_page); - d_addr = kmap_local_page(d_page); + d_zpdesc = get_next_zpdesc(d_zpdesc); + d_addr = kmap_local_zpdesc(d_zpdesc); d_size = class->size - written; d_off = 0; } @@ -1535,18 +1548,18 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, * 
return handle. */ static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int *obj_idx) + struct zpdesc *zpdesc, int *obj_idx) { unsigned int offset; int index = *obj_idx; unsigned long handle = 0; - void *addr = kmap_local_page(page); + void *addr = kmap_local_zpdesc(zpdesc); - offset = get_first_obj_offset(page); + offset = get_first_obj_offset(zpdesc); offset += class->size * index; while (offset < PAGE_SIZE) { - if (obj_allocated(page, addr + offset, &handle)) + if (obj_allocated(zpdesc, addr + offset, &handle)) break; offset += class->size; @@ -1566,14 +1579,14 @@ static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, unsigned long used_obj, free_obj; unsigned long handle; int obj_idx = 0; - struct page *s_page = get_first_page(src_zspage); + struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage); struct size_class *class = pool->size_class[src_zspage->class]; while (1) { - handle = find_alloced_obj(class, s_page, &obj_idx); + handle = find_alloced_obj(class, s_zpdesc, &obj_idx); if (!handle) { - s_page = get_next_page(s_page); - if (!s_page) + s_zpdesc = get_next_zpdesc(s_zpdesc); + if (!s_zpdesc) break; obj_idx = 0; continue; @@ -1653,7 +1666,7 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage) */ static void lock_zspage(struct zspage *zspage) { - struct page *curr_page, *page; + struct zpdesc *curr_zpdesc, *zpdesc; /* * Pages we haven't locked yet can be migrated off the list while we're @@ -1665,24 +1678,24 @@ static void lock_zspage(struct zspage *zspage) */ while (1) { migrate_read_lock(zspage); - page = get_first_page(zspage); - if (trylock_page(page)) + zpdesc = get_first_zpdesc(zspage); + if (zpdesc_trylock(zpdesc)) break; - get_page(page); + zpdesc_get(zpdesc); migrate_read_unlock(zspage); - wait_on_page_locked(page); - put_page(page); + zpdesc_wait_locked(zpdesc); + zpdesc_put(zpdesc); } - curr_page = page; - while ((page = get_next_page(curr_page))) { - if (trylock_page(page)) { - curr_page = page; + curr_zpdesc = zpdesc; + while ((zpdesc = get_next_zpdesc(curr_zpdesc))) { + if (zpdesc_trylock(zpdesc)) { + curr_zpdesc = zpdesc; } else { - get_page(page); + zpdesc_get(zpdesc); migrate_read_unlock(zspage); - wait_on_page_locked(page); - put_page(page); + zpdesc_wait_locked(zpdesc); + zpdesc_put(zpdesc); migrate_read_lock(zspage); } } @@ -1720,26 +1733,28 @@ static void migrate_write_unlock(struct zspage *zspage) static const struct movable_operations zsmalloc_mops; static void replace_sub_page(struct size_class *class, struct zspage *zspage, - struct page *newpage, struct page *oldpage) + struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc) { - struct page *page; - struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; + struct zpdesc *zpdesc; + struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; + unsigned int first_obj_offset; int idx = 0; - page = get_first_page(zspage); + zpdesc = get_first_zpdesc(zspage); do { - if (page == oldpage) - pages[idx] = newpage; + if (zpdesc == oldzpdesc) + zpdescs[idx] = newzpdesc; else - pages[idx] = page; + zpdescs[idx] = zpdesc; idx++; - } while ((page = get_next_page(page)) != NULL); + } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); - create_page_chain(class, zspage, pages); - set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); + create_page_chain(class, zspage, zpdescs); + first_obj_offset = get_first_obj_offset(oldzpdesc); + set_first_obj_offset(newzpdesc, first_obj_offset); if (unlikely(ZsHugePage(zspage))) - newpage->index = 
oldpage->index; - __SetPageMovable(newpage, &zsmalloc_mops); + newzpdesc->handle = oldzpdesc->handle; + __zpdesc_set_movable(newzpdesc, &zsmalloc_mops); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) @@ -1759,20 +1774,22 @@ static int zs_page_migrate(struct page *newpage, struct page *page, struct zs_pool *pool; struct size_class *class; struct zspage *zspage; - struct page *dummy; + struct zpdesc *dummy; + struct zpdesc *newzpdesc = page_zpdesc(newpage); + struct zpdesc *zpdesc = page_zpdesc(page); void *s_addr, *d_addr, *addr; unsigned int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; - VM_BUG_ON_PAGE(!PageIsolated(page), page); + VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc)); /* We're committed, tell the world that this is a Zsmalloc page. */ - __SetPageZsmalloc(newpage); + __zpdesc_set_zsmalloc(newzpdesc); /* The page is locked, so this pointer must remain valid */ - zspage = get_zspage(page); + zspage = get_zspage(zpdesc); pool = zspage->pool; /* @@ -1789,30 +1806,29 @@ static int zs_page_migrate(struct page *newpage, struct page *page, /* the migrate_write_lock protects zpage access via zs_map_object */ migrate_write_lock(zspage); - offset = get_first_obj_offset(page); - s_addr = kmap_local_page(page); + offset = get_first_obj_offset(zpdesc); + s_addr = kmap_local_zpdesc(zpdesc); /* * Here, any user cannot access all objects in the zspage so let's move. */ - d_addr = kmap_local_page(newpage); + d_addr = kmap_local_zpdesc(newzpdesc); copy_page(d_addr, s_addr); kunmap_local(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { - if (obj_allocated(page, addr, &handle)) { + if (obj_allocated(zpdesc, addr, &handle)) { old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); - new_obj = (unsigned long)location_to_obj(newpage, - obj_idx); + new_obj = (unsigned long)location_to_obj(newzpdesc, obj_idx); record_obj(handle, new_obj); } } kunmap_local(s_addr); - replace_sub_page(class, zspage, newpage, page); + replace_sub_page(class, zspage, newzpdesc, zpdesc); /* * Since we complete the data copy and set up new zspage structure, * it's okay to release migration_lock. 
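The migration hunk above re-encodes every live handle with location_to_obj() after copying the page contents to the new zpdesc. A minimal user-space sketch of that (pfn, obj_idx) packing follows; it takes the pfn directly rather than a zpdesc, and the OBJ_INDEX_BITS/OBJ_INDEX_MASK values here are illustrative stand-ins, not the kernel's derived constants.

#include <assert.h>
#include <stdio.h>

/* Illustrative only: the kernel derives OBJ_INDEX_BITS from the page size. */
#define OBJ_INDEX_BITS  10
#define OBJ_INDEX_MASK  ((1UL << OBJ_INDEX_BITS) - 1)

/* Pack a page frame number and an in-zspage object index into one value. */
static unsigned long location_to_obj(unsigned long pfn, unsigned int obj_idx)
{
        return (pfn << OBJ_INDEX_BITS) | (obj_idx & OBJ_INDEX_MASK);
}

/* Unpack the encoded value back into its (pfn, obj_idx) pair. */
static void obj_to_location(unsigned long obj, unsigned long *pfn,
                            unsigned int *obj_idx)
{
        *pfn = obj >> OBJ_INDEX_BITS;
        *obj_idx = obj & OBJ_INDEX_MASK;
}

int main(void)
{
        unsigned long pfn, obj = location_to_obj(0x12345, 7);
        unsigned int idx;

        obj_to_location(obj, &pfn, &idx);
        assert(pfn == 0x12345 && idx == 7);
        printf("obj=%#lx pfn=%#lx idx=%u\n", obj, pfn, idx);
        return 0;
}

Migration only rewrites the pfn half of the encoding; the object index is carried over unchanged because the object keeps its offset within the newly allocated page.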
@@ -1821,14 +1837,14 @@ static int zs_page_migrate(struct page *newpage, struct page *page, spin_unlock(&class->lock); migrate_write_unlock(zspage); - get_page(newpage); - if (page_zone(newpage) != page_zone(page)) { - dec_zone_page_state(page, NR_ZSPAGES); - inc_zone_page_state(newpage, NR_ZSPAGES); + zpdesc_get(newzpdesc); + if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) { + zpdesc_dec_zone_page_state(zpdesc); + zpdesc_inc_zone_page_state(newzpdesc); } - reset_page(page); - put_page(page); + reset_zpdesc(zpdesc); + zpdesc_put(zpdesc); return MIGRATEPAGE_SUCCESS; } @@ -1897,13 +1913,13 @@ static void init_deferred_free(struct zs_pool *pool) static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) { - struct page *page = get_first_page(zspage); + struct zpdesc *zpdesc = get_first_zpdesc(zspage); do { - WARN_ON(!trylock_page(page)); - __SetPageMovable(page, &zsmalloc_mops); - unlock_page(page); - } while ((page = get_next_page(page)) != NULL); + WARN_ON(!zpdesc_trylock(zpdesc)); + __zpdesc_set_movable(zpdesc, &zsmalloc_mops); + zpdesc_unlock(zpdesc); + } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); } #else static inline void zs_flush_migration(struct zs_pool *pool) { } diff --git a/mm/zswap.c b/mm/zswap.c index b84c20d889b1..23365e76a3ce 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -43,7 +43,7 @@ * statistics **********************************/ /* The number of compressed pages currently stored in zswap */ -atomic_long_t zswap_stored_pages = ATOMIC_INIT(0); +atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -1192,7 +1192,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o /* * It's safe to drop the lock here because we return either - * LRU_REMOVED_RETRY or LRU_RETRY. + * LRU_REMOVED_RETRY, LRU_RETRY or LRU_STOP. */ spin_unlock(&l->lock); @@ -1445,9 +1445,9 @@ resched: * main API **********************************/ -static ssize_t zswap_store_page(struct page *page, - struct obj_cgroup *objcg, - struct zswap_pool *pool) +static bool zswap_store_page(struct page *page, + struct obj_cgroup *objcg, + struct zswap_pool *pool) { swp_entry_t page_swpentry = page_swap_entry(page); struct zswap_entry *entry, *old; @@ -1456,7 +1456,7 @@ static ssize_t zswap_store_page(struct page *page, entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; - return -EINVAL; + return false; } if (!zswap_compress(page, entry, pool)) @@ -1483,13 +1483,17 @@ static ssize_t zswap_store_page(struct page *page, /* * The entry is successfully compressed and stored in the tree, there is - * no further possibility of failure. Grab refs to the pool and objcg. - * These refs will be dropped by zswap_entry_free() when the entry is - * removed from the tree. + * no further possibility of failure. Grab refs to the pool and objcg, + * charge zswap memory, and increment zswap_stored_pages. + * The opposite actions will be performed by zswap_entry_free() + * when the entry is removed from the tree. */ zswap_pool_get(pool); - if (objcg) + if (objcg) { obj_cgroup_get(objcg); + obj_cgroup_charge_zswap(objcg, entry->length); + } + atomic_long_inc(&zswap_stored_pages); /* * We finish initializing the entry while it's already in xarray. 
@@ -1510,13 +1514,13 @@ static ssize_t zswap_store_page(struct page *page, zswap_lru_add(&zswap_list_lru, entry); } - return entry->length; + return true; store_failed: zpool_free(pool->zpool, entry->handle); compress_failed: zswap_entry_cache_free(entry); - return -EINVAL; + return false; } bool zswap_store(struct folio *folio) @@ -1526,7 +1530,6 @@ bool zswap_store(struct folio *folio) struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; - size_t compressed_bytes = 0; bool ret = false; long index; @@ -1564,20 +1567,14 @@ bool zswap_store(struct folio *folio) for (index = 0; index < nr_pages; ++index) { struct page *page = folio_page(folio, index); - ssize_t bytes; - bytes = zswap_store_page(page, objcg, pool); - if (bytes < 0) + if (!zswap_store_page(page, objcg, pool)) goto put_pool; - compressed_bytes += bytes; } - if (objcg) { - obj_cgroup_charge_zswap(objcg, compressed_bytes); + if (objcg) count_objcg_events(objcg, ZSWPOUT, nr_pages); - } - atomic_long_add(nr_pages, &zswap_stored_pages); count_vm_events(ZSWPOUT, nr_pages); ret = true; |
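Looking back at the zpdesc.h hunk earlier in this diff, zpdesc_page(), zpdesc_folio() and page_zpdesc() all rely on the same _Generic idiom to keep const-qualification across the cast. A standalone sketch of that idiom, with stand-in struct types rather than the real page/zpdesc definitions:

#include <stdio.h>

struct a { int x; };
struct b { int x; };

/*
 * _Generic picks the cast based on the argument's pointer type, so a
 * const input yields a const result; assigning that result to a plain
 * struct b * would draw a compiler diagnostic.
 */
#define a_to_b(p) (_Generic((p),                         \
        const struct a *: (const struct b *)(p),         \
        struct a *: (struct b *)(p)))

int main(void)
{
        struct a obj = { .x = 42 };
        const struct a *cp = &obj;
        struct a *mp = &obj;

        const struct b *cb = a_to_b(cp);  /* constness preserved */
        struct b *mb = a_to_b(mp);        /* mutable stays mutable */

        printf("%p %p\n", (const void *)cb, (void *)mb);
        return 0;
}

The same pattern is what lets one wrapper accept either const or non-const pointers without duplicating every helper.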