Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 76
-rw-r--r--  mm/Makefile | 1
-rw-r--r--  mm/cma.h | 2
-rw-r--r--  mm/compaction.c | 82
-rw-r--r--  mm/damon/Kconfig | 30
-rw-r--r--  mm/damon/Makefile | 1
-rw-r--r--  mm/damon/core.c | 259
-rw-r--r--  mm/damon/dbgfs.c | 1148
-rw-r--r--  mm/damon/paddr.c | 83
-rw-r--r--  mm/damon/reclaim.c | 2
-rw-r--r--  mm/damon/sysfs-common.h | 16
-rw-r--r--  mm/damon/sysfs-schemes.c | 271
-rw-r--r--  mm/damon/sysfs.c | 188
-rw-r--r--  mm/damon/tests/.kunitconfig | 7
-rw-r--r--  mm/damon/tests/core-kunit.h | 14
-rw-r--r--  mm/damon/tests/dbgfs-kunit.h | 173
-rw-r--r--  mm/damon/tests/vaddr-kunit.h | 2
-rw-r--r--  mm/damon/vaddr.c | 2
-rw-r--r--  mm/debug.c | 71
-rw-r--r--  mm/early_ioremap.c | 8
-rw-r--r--  mm/execmem.c | 39
-rw-r--r--  mm/filemap.c | 185
-rw-r--r--  mm/gup.c | 116
-rw-r--r--  mm/huge_memory.c | 21
-rw-r--r--  mm/hugetlb.c | 443
-rw-r--r--  mm/hugetlb_cgroup.c | 18
-rw-r--r--  mm/hugetlb_vmemmap.c | 2
-rw-r--r--  mm/init-mm.c | 2
-rw-r--r--  mm/internal.h | 67
-rw-r--r--  mm/kasan/generic.c | 18
-rw-r--r--  mm/kasan/hw_tags.c | 5
-rw-r--r--  mm/kasan/kasan.h | 18
-rw-r--r--  mm/kasan/kasan_test_c.c | 4
-rw-r--r--  mm/kasan/report.c | 34
-rw-r--r--  mm/kasan/sw_tags.c | 3
-rw-r--r--  mm/kfence/core.c | 2
-rw-r--r--  mm/kfence/kfence_test.c | 3
-rw-r--r--  mm/kfence/report.c | 3
-rw-r--r--  mm/khugepaged.c | 45
-rw-r--r--  mm/kmemleak.c | 6
-rw-r--r--  mm/kmsan/hooks.c | 1
-rw-r--r--  mm/kmsan/shadow.c | 8
-rw-r--r--  mm/ksm.c | 19
-rw-r--r--  mm/madvise.c | 18
-rw-r--r--  mm/memblock.c | 20
-rw-r--r--  mm/memcontrol-v1.c | 22
-rw-r--r--  mm/memcontrol.c | 124
-rw-r--r--  mm/memfd.c | 139
-rw-r--r--  mm/memory-failure.c | 65
-rw-r--r--  mm/memory.c | 301
-rw-r--r--  mm/memory_hotplug.c | 74
-rw-r--r--  mm/mempolicy.c | 114
-rw-r--r--  mm/migrate.c | 175
-rw-r--r--  mm/migrate_device.c | 13
-rw-r--r--  mm/mm_init.c | 8
-rw-r--r--  mm/mmap.c | 575
-rw-r--r--  mm/mmap_lock.c | 50
-rw-r--r--  mm/mmu_gather.c | 25
-rw-r--r--  mm/mseal.c | 6
-rw-r--r--  mm/numa.c | 8
-rw-r--r--  mm/numa_emulation.c | 45
-rw-r--r--  mm/numa_memblks.c | 2
-rw-r--r--  mm/oom_kill.c | 10
-rw-r--r--  mm/page-writeback.c | 55
-rw-r--r--  mm/page_alloc.c | 199
-rw-r--r--  mm/page_counter.c | 4
-rw-r--r--  mm/page_frag_cache.c | 6
-rw-r--r--  mm/page_idle.c | 10
-rw-r--r--  mm/page_io.c | 1
-rw-r--r--  mm/page_isolation.c | 22
-rw-r--r--  mm/percpu.c | 74
-rw-r--r--  mm/pt_reclaim.c | 71
-rw-r--r--  mm/readahead.c | 49
-rw-r--r--  mm/rodata_test.c | 7
-rw-r--r--  mm/secretmem.c | 3
-rw-r--r--  mm/shmem.c | 376
-rw-r--r--  mm/shrinker_debug.c | 16
-rw-r--r--  mm/slab.h | 38
-rw-r--r--  mm/slab_common.c | 918
-rw-r--r--  mm/slub.c | 361
-rw-r--r--  mm/sparse-vmemmap.c | 5
-rw-r--r--  mm/sparse.c | 5
-rw-r--r--  mm/swap.c | 74
-rw-r--r--  mm/swap_cgroup.c | 234
-rw-r--r--  mm/swap_slots.c | 78
-rw-r--r--  mm/swap_state.c | 1
-rw-r--r--  mm/swapfile.c | 1269
-rw-r--r--  mm/truncate.c | 53
-rw-r--r--  mm/usercopy.c | 18
-rw-r--r--  mm/userfaultfd.c | 156
-rw-r--r--  mm/util.c | 182
-rw-r--r--  mm/vma.c | 597
-rw-r--r--  mm/vma.h | 48
-rw-r--r--  mm/vma_internal.h | 1
-rw-r--r--  mm/vmalloc.c | 8
-rw-r--r--  mm/vmscan.c | 528
-rw-r--r--  mm/vmstat.c | 2
-rw-r--r--  mm/workingset.c | 65
-rw-r--r--  mm/zpdesc.h | 182
-rw-r--r--  mm/zsmalloc.c | 440
-rw-r--r--  mm/zswap.c | 39
101 files changed, 6211 insertions, 5276 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 84000b016808..0b7f4bb5cb80 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -242,6 +242,10 @@ menu "Slab allocator options"
config SLUB
def_bool y
+config KVFREE_RCU_BATCHED
+ def_bool y
+ depends on !SLUB_TINY && !TINY_RCU
+
config SLUB_TINY
bool "Configure for minimal memory footprint"
depends on EXPERT
@@ -550,20 +554,63 @@ menuconfig MEMORY_HOTPLUG
if MEMORY_HOTPLUG
-config MEMORY_HOTPLUG_DEFAULT_ONLINE
- bool "Online the newly added memory blocks by default"
- depends on MEMORY_HOTPLUG
+choice
+ prompt "Memory Hotplug Default Online Type"
+ default MHP_DEFAULT_ONLINE_TYPE_OFFLINE
help
+ Default memory type for hotplugged memory.
+
This option sets the default policy setting for memory hotplug
onlining policy (/sys/devices/system/memory/auto_online_blocks) which
determines what happens to newly added memory regions. Policy setting
can always be changed at runtime.
+
+ The default is 'offline'.
+
+ Select offline to defer onlining to drivers and user policy.
+ Select auto to let the kernel choose what zones to utilize.
+ Select online_kernel to generally allow kernel usage of this memory.
+ Select online_movable to generally disallow kernel usage of this memory.
+
+ Example kernel usage would be page structs and page tables.
+
See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
- Say Y here if you want all hot-plugged memory blocks to appear in
- 'online' state by default.
- Say N here if you want the default policy to keep all hot-plugged
- memory blocks in 'offline' state.
+config MHP_DEFAULT_ONLINE_TYPE_OFFLINE
+ bool "offline"
+ help
+ Hotplugged memory will not be onlined by default.
+ Choose this for systems with drivers and user policy that
+ handle onlining of hotplugged memory.
+
+config MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO
+ bool "auto"
+ help
+ Select this if you want the kernel to automatically online
+ hotplugged memory into the zone it thinks is reasonable.
+ This memory may be utilized for kernel data.
+
+config MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL
+ bool "kernel"
+ help
+ Select this if you want the kernel to automatically online
+ hotplugged memory into a zone capable of being used for kernel
+ data. This typically means ZONE_NORMAL.
+
+config MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE
+ bool "movable"
+ help
+ Select this if you want the kernel to automatically online
+ hotplug memory into ZONE_MOVABLE. This memory will generally
+ not be utilized for kernel data.
+
+ This should only be used when the admin knows sufficient
+ ZONE_NORMAL memory is available to describe hotplug memory,
+ otherwise hotplug memory may fail to online. For example,
+ sufficient kernel-capable memory (ZONE_NORMAL) must be
+ available to allocate page structs to describe ZONE_MOVABLE.
+
+endchoice
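
The help texts above refer to the runtime knob /sys/devices/system/memory/auto_online_blocks; the Kconfig choice only selects the boot-time default. A minimal userspace sketch (illustrative only, assuming the value strings accepted by that file are "offline", "online", "online_kernel" and "online_movable") showing how the policy can be overridden at runtime:

#include <stdio.h>

int main(void)
{
        /* path taken from the help text above */
        FILE *f = fopen("/sys/devices/system/memory/auto_online_blocks", "w");

        if (!f)
                return 1;
        /* "online" corresponds to the "auto" choice */
        fputs("online_movable", f);
        return fclose(f) ? 1 : 0;
}
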
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
@@ -1301,6 +1348,21 @@ config ARCH_HAS_USER_SHADOW_STACK
The architecture has hardware support for userspace shadow call
stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss).
+config ARCH_SUPPORTS_PT_RECLAIM
+ def_bool n
+
+config PT_RECLAIM
+ bool "reclaim empty user page table pages"
+ default y
+ depends on ARCH_SUPPORTS_PT_RECLAIM && MMU && SMP
+ select MMU_GATHER_RCU_TABLE_FREE
+ help
+ Try to reclaim empty user page table pages in paths other than the
+ munmap and exit_mmap paths.
+
+ Note: currently only empty user PTE page table pages will be reclaimed.
+
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index dba52bb0da8a..850386a67b3e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -146,3 +146,4 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_EXECMEM) += execmem.o
obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
+obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
diff --git a/mm/cma.h b/mm/cma.h
index ad61cc6dd439..8485ef893e99 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -36,7 +36,7 @@ struct cma {
};
extern struct cma cma_areas[MAX_CMA_AREAS];
-extern unsigned cma_area_count;
+extern unsigned int cma_area_count;
static inline unsigned long cma_bitmap_maxno(struct cma *cma)
{
diff --git a/mm/compaction.c b/mm/compaction.c
index a2b16b08cbbf..a3203d97123e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -83,6 +83,7 @@ static inline bool is_via_compact_memory(int order) { return false; }
static struct page *mark_allocated_noprof(struct page *page, unsigned int order, gfp_t gfp_flags)
{
post_alloc_hook(page, order, __GFP_MOVABLE);
+ set_page_refcounted(page);
return page;
}
#define mark_allocated(...) alloc_hooks(mark_allocated_noprof(__VA_ARGS__))
@@ -630,7 +631,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (PageCompound(page)) {
const unsigned int order = compound_order(page);
- if (blockpfn + (1UL << order) <= end_pfn) {
+ if ((order <= MAX_PAGE_ORDER) &&
+ (blockpfn + (1UL << order) <= end_pfn)) {
blockpfn += (1UL << order) - 1;
page += (1UL << order) - 1;
nr_scanned += (1UL << order) - 1;
@@ -1868,6 +1870,7 @@ again:
dst = (struct folio *)freepage;
post_alloc_hook(&dst->page, order, __GFP_MOVABLE);
+ set_page_refcounted(&dst->page);
if (order)
prep_compound_page(&dst->page, order);
cc->nr_freepages -= 1 << order;
@@ -2488,7 +2491,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
*/
static enum compact_result
compaction_suit_allocation_order(struct zone *zone, unsigned int order,
- int highest_zoneidx, unsigned int alloc_flags)
+ int highest_zoneidx, unsigned int alloc_flags,
+ bool async)
{
unsigned long watermark;
@@ -2497,6 +2501,23 @@ compaction_suit_allocation_order(struct zone *zone, unsigned int order,
alloc_flags))
return COMPACT_SUCCESS;
+ /*
+ * For unmovable allocations (without ALLOC_CMA), check if there is enough
+ * free memory in the non-CMA pageblocks. Otherwise compaction could form
+ * the high-order page in CMA pageblocks, which would not help the
+ * allocation to succeed. However, limit the check to costly order async
+ * compaction (such as opportunistic THP attempts) because there is the
+ * possibility that compaction would migrate pages from non-CMA to CMA
+ * pageblock.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER && async &&
+ !(alloc_flags & ALLOC_CMA)) {
+ watermark = low_wmark_pages(zone) + compact_gap(order);
+ if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
+ 0, zone_page_state(zone, NR_FREE_PAGES)))
+ return COMPACT_SKIPPED;
+ }
+
if (!compaction_suitable(zone, order, highest_zoneidx))
return COMPACT_SKIPPED;
@@ -2532,7 +2553,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
if (!is_via_compact_memory(cc->order)) {
ret = compaction_suit_allocation_order(cc->zone, cc->order,
cc->highest_zoneidx,
- cc->alloc_flags);
+ cc->alloc_flags,
+ cc->mode == MIGRATE_ASYNC);
if (ret != COMPACT_CONTINUE)
return ret;
}
@@ -3035,7 +3057,8 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
ret = compaction_suit_allocation_order(zone,
pgdat->kcompactd_max_order,
- highest_zoneidx, ALLOC_WMARK_MIN);
+ highest_zoneidx, ALLOC_WMARK_MIN,
+ false);
if (ret == COMPACT_CONTINUE)
return true;
}
@@ -3076,7 +3099,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
continue;
ret = compaction_suit_allocation_order(zone,
- cc.order, zoneid, ALLOC_WMARK_MIN);
+ cc.order, zoneid, ALLOC_WMARK_MIN,
+ false);
if (ret != COMPACT_CONTINUE)
continue;
@@ -3154,15 +3178,10 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
static int kcompactd(void *p)
{
pg_data_t *pgdat = (pg_data_t *)p;
- struct task_struct *tsk = current;
long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
long timeout = default_timeout;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-
- if (!cpumask_empty(cpumask))
- set_cpus_allowed_ptr(tsk, cpumask);
-
+ current->flags |= PF_KCOMPACTD;
set_freezable();
pgdat->kcompactd_max_order = 0;
@@ -3219,6 +3238,8 @@ static int kcompactd(void *p)
pgdat->proactive_compact_trigger = false;
}
+ current->flags &= ~PF_KCOMPACTD;
+
return 0;
}
@@ -3233,10 +3254,12 @@ void __meminit kcompactd_run(int nid)
if (pgdat->kcompactd)
return;
- pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+ pgdat->kcompactd = kthread_create_on_node(kcompactd, pgdat, nid, "kcompactd%d", nid);
if (IS_ERR(pgdat->kcompactd)) {
pr_err("Failed to start kcompactd on node %d\n", nid);
pgdat->kcompactd = NULL;
+ } else {
+ wake_up_process(pgdat->kcompactd);
}
}
@@ -3254,30 +3277,6 @@ void __meminit kcompactd_stop(int nid)
}
}
-/*
- * It's optimal to keep kcompactd on the same CPUs as their memory, but
- * not required for correctness. So if the last cpu in a node goes
- * away, we get changed to run anywhere: as the first one comes back,
- * restore their cpu bindings.
- */
-static int kcompactd_cpu_online(unsigned int cpu)
-{
- int nid;
-
- for_each_node_state(nid, N_MEMORY) {
- pg_data_t *pgdat = NODE_DATA(nid);
- const struct cpumask *mask;
-
- mask = cpumask_of_node(pgdat->node_id);
-
- if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
- /* One of our CPUs online: restore mask */
- if (pgdat->kcompactd)
- set_cpus_allowed_ptr(pgdat->kcompactd, mask);
- }
- return 0;
-}
-
static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table,
int write, void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -3297,7 +3296,7 @@ static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table,
return ret;
}
-static struct ctl_table vm_compaction[] = {
+static const struct ctl_table vm_compaction[] = {
{
.procname = "compact_memory",
.data = &sysctl_compact_memory,
@@ -3337,15 +3336,6 @@ static struct ctl_table vm_compaction[] = {
static int __init kcompactd_init(void)
{
int nid;
- int ret;
-
- ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
- "mm/compaction:online",
- kcompactd_cpu_online, NULL);
- if (ret < 0) {
- pr_err("kcompactd: failed to register hotplug callbacks.\n");
- return ret;
- }
for_each_node_state(nid, N_MEMORY)
kcompactd_run(nid);
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index d0357f3e9372..c213cf8b5638 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -71,36 +71,6 @@ config DAMON_SYSFS_KUNIT_TEST
If unsure, say N.
-config DAMON_DBGFS_DEPRECATED
- bool "DAMON debugfs interface (DEPRECATED!)"
- depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
- help
- This builds the debugfs interface for DAMON. The user space admins
- can use the interface for arbitrary data access monitoring.
-
- If unsure, say N.
-
- This is deprecated, so users should move to the sysfs interface
- (DAMON_SYSFS). If you depend on this and cannot move, please report
- your usecase to damon@lists.linux.dev and linux-mm@kvack.org.
-
-config DAMON_DBGFS
- bool
- default y
- depends on DAMON_DBGFS_DEPRECATED
-
-config DAMON_DBGFS_KUNIT_TEST
- bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
- depends on DAMON_DBGFS && KUNIT=y
- default KUNIT_ALL_TESTS
- help
- This builds the DAMON debugfs interface Kunit test suite.
-
- For more information on KUnit and unit tests in general, please refer
- to the KUnit documentation.
-
- If unsure, say N.
-
config DAMON_RECLAIM
bool "Build DAMON-based reclaim (DAMON_RECLAIM)"
depends on DAMON_PADDR
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
index f7add3f4aa79..8b49012ba8c3 100644
--- a/mm/damon/Makefile
+++ b/mm/damon/Makefile
@@ -4,6 +4,5 @@ obj-y := core.o
obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o
obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o
obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o
-obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o
obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o
obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 0776452a1abb..384935ef4e65 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -14,6 +14,7 @@
#include <linux/psi.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#define CREATE_TRACE_POINTS
#include <trace/events/damon.h>
@@ -266,7 +267,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
}
struct damos_filter *damos_new_filter(enum damos_filter_type type,
- bool matching)
+ bool matching, bool allow)
{
struct damos_filter *filter;
@@ -275,6 +276,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
return NULL;
filter->type = type;
filter->matching = matching;
+ filter->allow = allow;
INIT_LIST_HEAD(&filter->list);
return filter;
}
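
A short sketch of the new allow/reject semantics (not part of this patch, assuming the <linux/damon.h> declarations); example_install_filters() and its scheme argument are hypothetical, while the two filter types match the callers updated elsewhere in this series:

static int example_install_filters(struct damos *scheme)
{
        struct damos_filter *filter;

        /* matching anonymous pages now means "reject them" (allow == false) */
        filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false);
        if (!filter)
                return -ENOMEM;
        damos_add_filter(scheme, filter);

        /* matching young pages means "let them pass" (allow == true) */
        filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true, true);
        if (!filter)
                return -ENOMEM;
        damos_add_filter(scheme, filter);
        return 0;
}
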
@@ -371,6 +373,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
* or damon_attrs are updated.
*/
scheme->next_apply_sis = 0;
+ scheme->walk_completed = false;
INIT_LIST_HEAD(&scheme->filters);
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
@@ -504,6 +507,8 @@ struct damon_ctx *damon_new_ctx(void)
ctx->next_ops_update_sis = 0;
mutex_init(&ctx->kdamond_lock);
+ mutex_init(&ctx->call_control_lock);
+ mutex_init(&ctx->walk_control_lock);
ctx->attrs.min_nr_regions = 10;
ctx->attrs.max_nr_regions = 1000;
@@ -803,7 +808,8 @@ static int damos_commit_filters(struct damos *dst, struct damos *src)
continue;
new_filter = damos_new_filter(
- src_filter->type, src_filter->matching);
+ src_filter->type, src_filter->matching,
+ src_filter->allow);
if (!new_filter)
return -ENOMEM;
damos_commit_filter_arg(new_filter, src_filter);
@@ -1162,6 +1168,94 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs)
return err;
}
+static bool damon_is_running(struct damon_ctx *ctx)
+{
+ bool running;
+
+ mutex_lock(&ctx->kdamond_lock);
+ running = ctx->kdamond != NULL;
+ mutex_unlock(&ctx->kdamond_lock);
+ return running;
+}
+
+/**
+ * damon_call() - Invoke a given function on DAMON worker thread (kdamond).
+ * @ctx: DAMON context to call the function for.
+ * @control: Control variable of the call request.
+ *
+ * Ask DAMON worker thread (kdamond) of @ctx to call a function with an
+ * argument data that respectively passed via &damon_call_control->fn and
+ * &damon_call_control->data of @control, and wait until the kdamond finishes
+ * handling of the request.
+ *
+ * The kdamond executes the function with the argument in the main loop, just
+ * after a sampling of the iteration is finished. The function can hence
+ * safely access the internal data of the &struct damon_ctx without additional
+ * synchronization. The return value of the function will be saved in
+ * &damon_call_control->return_code.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
+{
+ init_completion(&control->completion);
+ control->canceled = false;
+
+ mutex_lock(&ctx->call_control_lock);
+ if (ctx->call_control) {
+ mutex_unlock(&ctx->call_control_lock);
+ return -EBUSY;
+ }
+ ctx->call_control = control;
+ mutex_unlock(&ctx->call_control_lock);
+ if (!damon_is_running(ctx))
+ return -EINVAL;
+ wait_for_completion(&control->completion);
+ if (control->canceled)
+ return -ECANCELED;
+ return 0;
+}
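
A minimal usage sketch for damon_call(), assuming the int (*fn)(void *) callback prototype that kdamond_call() further down implies; read_sample_interval() and example_read() are hypothetical helpers, not part of this patch:

static int read_sample_interval(void *data)
{
        struct damon_ctx *ctx = data;

        /* runs on the kdamond thread, so ctx internals are stable here */
        pr_info("sample interval: %lu\n", ctx->attrs.sample_interval);
        return 0;
}

static int example_read(struct damon_ctx *ctx)
{
        struct damon_call_control control = {
                .fn = read_sample_interval,
                .data = ctx,
        };
        int err;

        err = damon_call(ctx, &control);
        return err ? err : control.return_code;
}
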
+
+/**
+ * damos_walk() - Invoke a given function while DAMOS walks regions.
+ * @ctx: DAMON context to call the functions for.
+ * @control: Control variable of the walk request.
+ *
+ * Ask DAMON worker thread (kdamond) of @ctx to call a function for each region
+ * that the kdamond will apply DAMOS action to, and wait until the kdamond
+ * finishes handling of the request.
+ *
+ * The kdamond executes the given function in the main loop, for each region
+ * just after it applied any DAMOS actions of @ctx to it. The invocation is
+ * made only within one &damos->apply_interval_us since damos_walk()
+ * invocation, for each scheme. The given callback function can hence safely
+ * access the internal data of &struct damon_ctx and &struct damon_region that
+ * each of the scheme will apply the action for next interval, without
+ * additional synchronizations against the kdamond. If every scheme of @ctx
+ * passed at least one &damos->apply_interval_us, kdamond marks the request as
+ * completed so that damos_walk() can wake up and return.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control)
+{
+ init_completion(&control->completion);
+ control->canceled = false;
+ mutex_lock(&ctx->walk_control_lock);
+ if (ctx->walk_control) {
+ mutex_unlock(&ctx->walk_control_lock);
+ return -EBUSY;
+ }
+ ctx->walk_control = control;
+ mutex_unlock(&ctx->walk_control_lock);
+ if (!damon_is_running(ctx))
+ return -EINVAL;
+ wait_for_completion(&control->completion);
+ if (control->canceled)
+ return -ECANCELED;
+ return 0;
+}
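
A similar sketch for damos_walk(); the walk_fn parameter list is inferred from damos_walk_call_walk() later in this file and may not match the exact prototype in the header, and count_bytes()/example_walk() are hypothetical:

static void count_bytes(void *data, struct damon_ctx *ctx,
                struct damon_target *t, struct damon_region *r,
                struct damos *s, unsigned long sz_filter_passed)
{
        unsigned long *total = data;

        *total += damon_sz_region(r);
}

static int example_walk(struct damon_ctx *ctx)
{
        unsigned long total = 0;
        struct damos_walk_control control = {
                .walk_fn = count_bytes,
                .data = &total,
        };
        int err;

        err = damos_walk(ctx, &control);
        if (!err)
                pr_info("schemes were applied to %lu bytes\n", total);
        return err;
}
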
+
/*
* Reset the aggregated monitoring results ('nr_accesses' of each region).
*/
@@ -1272,16 +1366,18 @@ static bool damos_skip_charged_region(struct damon_target *t,
}
static void damos_update_stat(struct damos *s,
- unsigned long sz_tried, unsigned long sz_applied)
+ unsigned long sz_tried, unsigned long sz_applied,
+ unsigned long sz_ops_filter_passed)
{
s->stat.nr_tried++;
s->stat.sz_tried += sz_tried;
if (sz_applied)
s->stat.nr_applied++;
s->stat.sz_applied += sz_applied;
+ s->stat.sz_ops_filter_passed += sz_ops_filter_passed;
}
-static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
+static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t,
struct damon_region *r, struct damos_filter *filter)
{
bool matched = false;
@@ -1334,13 +1430,103 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
{
struct damos_filter *filter;
+ s->core_filters_allowed = false;
damos_for_each_filter(filter, s) {
- if (__damos_filter_out(ctx, t, r, filter))
- return true;
+ if (damos_filter_match(ctx, t, r, filter)) {
+ if (filter->allow)
+ s->core_filters_allowed = true;
+ return !filter->allow;
+ }
}
return false;
}
+/*
+ * damos_walk_call_walk() - Call &damos_walk_control->walk_fn.
+ * @ctx: The context of &damon_ctx->walk_control.
+ * @t: The monitoring target of @r that @s will be applied.
+ * @r: The region of @t that @s will be applied.
+ * @s: The scheme of @ctx that will be applied to @r.
+ *
+ * This function is called from kdamond whenever it asked the operation set to
+ * apply a DAMOS scheme action to a region. If a DAMOS walk request is
+ * installed by damos_walk() and not yet uninstalled, invoke it.
+ */
+static void damos_walk_call_walk(struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *s,
+ unsigned long sz_filter_passed)
+{
+ struct damos_walk_control *control;
+
+ mutex_lock(&ctx->walk_control_lock);
+ control = ctx->walk_control;
+ mutex_unlock(&ctx->walk_control_lock);
+ if (!control)
+ return;
+ control->walk_fn(control->data, ctx, t, r, s, sz_filter_passed);
+}
+
+/*
+ * damos_walk_complete() - Complete DAMOS walk request if all walks are done.
+ * @ctx: The context of &damon_ctx->walk_control.
+ * @s: A scheme of @ctx that all walks are now done.
+ *
+ * This function is called when kdamond finished applying the action of a DAMOS
+ * scheme to all regions that are eligible for the given &damos->apply_interval_us.
+ * If every scheme of @ctx including @s now finished walking for at least one
+ * &damos->apply_interval_us, this function marks the handling of the given
+ * DAMOS walk request as done, so that damos_walk() can wake up and return.
+ */
+static void damos_walk_complete(struct damon_ctx *ctx, struct damos *s)
+{
+ struct damos *siter;
+ struct damos_walk_control *control;
+
+ mutex_lock(&ctx->walk_control_lock);
+ control = ctx->walk_control;
+ mutex_unlock(&ctx->walk_control_lock);
+ if (!control)
+ return;
+
+ s->walk_completed = true;
+ /* if all schemes completed, signal completion to walker */
+ damon_for_each_scheme(siter, ctx) {
+ if (!siter->walk_completed)
+ return;
+ }
+ complete(&control->completion);
+ mutex_lock(&ctx->walk_control_lock);
+ ctx->walk_control = NULL;
+ mutex_unlock(&ctx->walk_control_lock);
+}
+
+/*
+ * damos_walk_cancel() - Cancel the current DAMOS walk request.
+ * @ctx: The context of &damon_ctx->walk_control.
+ *
+ * This function is called when @ctx is deactivated by DAMOS watermarks, DAMOS
+ * walk is requested but there is no DAMOS scheme to walk for, or the kdamond
+ * is already out of the main loop and therefore going to be terminated, and hence
+ * cannot continue the walks. This function therefore marks the walk request
+ * as canceled, so that damos_walk() can wake up and return.
+ */
+static void damos_walk_cancel(struct damon_ctx *ctx)
+{
+ struct damos_walk_control *control;
+
+ mutex_lock(&ctx->walk_control_lock);
+ control = ctx->walk_control;
+ mutex_unlock(&ctx->walk_control_lock);
+
+ if (!control)
+ return;
+ control->canceled = true;
+ complete(&control->completion);
+ mutex_lock(&ctx->walk_control_lock);
+ ctx->walk_control = NULL;
+ mutex_unlock(&ctx->walk_control_lock);
+}
+
static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
struct damon_region *r, struct damos *s)
{
@@ -1348,6 +1534,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
unsigned long sz = damon_sz_region(r);
struct timespec64 begin, end;
unsigned long sz_applied = 0;
+ unsigned long sz_ops_filter_passed = 0;
int err = 0;
/*
* We plan to support multiple context per kdamond, as DAMON sysfs
@@ -1393,8 +1580,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
if (!err) {
trace_damos_before_apply(cidx, sidx, tidx, r,
damon_nr_regions(t), do_trace);
- sz_applied = c->ops.apply_scheme(c, t, r, s);
+ sz_applied = c->ops.apply_scheme(c, t, r, s,
+ &sz_ops_filter_passed);
}
+ damos_walk_call_walk(c, t, r, s, sz_ops_filter_passed);
ktime_get_coarse_ts64(&end);
quota->total_charged_ns += timespec64_to_ns(&end) -
timespec64_to_ns(&begin);
@@ -1408,7 +1597,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t,
r->age = 0;
update_stat:
- damos_update_stat(s, sz, sz_applied);
+ damos_update_stat(s, sz, sz_applied, sz_ops_filter_passed);
}
static void damon_do_apply_schemes(struct damon_ctx *c,
@@ -1550,7 +1739,7 @@ static unsigned long damos_quota_score(struct damos_quota *quota)
static void damos_set_effective_quota(struct damos_quota *quota)
{
unsigned long throughput;
- unsigned long esz;
+ unsigned long esz = ULONG_MAX;
if (!quota->ms && list_empty(&quota->goals)) {
quota->esz = quota->sz;
@@ -1572,10 +1761,7 @@ static void damos_set_effective_quota(struct damos_quota *quota)
quota->total_charged_ns;
else
throughput = PAGE_SIZE * 1024;
- if (!list_empty(&quota->goals))
- esz = min(throughput * quota->ms, esz);
- else
- esz = throughput * quota->ms;
+ esz = min(throughput * quota->ms, esz);
}
if (quota->sz && quota->sz < esz)
@@ -1666,6 +1852,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
damon_for_each_scheme(s, c) {
if (c->passed_sample_intervals < s->next_apply_sis)
continue;
+ damos_walk_complete(c, s);
s->next_apply_sis = c->passed_sample_intervals +
(s->apply_interval_us ? s->apply_interval_us :
c->attrs.aggr_interval) / sample_interval;
@@ -1894,9 +2081,8 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme)
if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) {
if (scheme->wmarks.activated)
pr_debug("deactivate a scheme (%d) for %s wmark\n",
- scheme->action,
- metric > scheme->wmarks.high ?
- "high" : "low");
+ scheme->action,
+ str_high_low(metric > scheme->wmarks.high));
scheme->wmarks.activated = false;
return scheme->wmarks.interval;
}
@@ -1920,6 +2106,39 @@ static void kdamond_usleep(unsigned long usecs)
usleep_range_idle(usecs, usecs + 1);
}
+/*
+ * kdamond_call() - handle damon_call_control.
+ * @ctx: The &struct damon_ctx of the kdamond.
+ * @cancel: Whether to cancel the invocation of the function.
+ *
+ * If there is a &struct damon_call_control request that registered via
+ * &damon_call() on @ctx, do or cancel the invocation of the function depending
+ * on @cancel. @cancel is set when the kdamond is deactivated by DAMOS
+ * watermarks, or the kdamond is already out of the main loop and therefore
+ * will be terminated.
+ */
+static void kdamond_call(struct damon_ctx *ctx, bool cancel)
+{
+ struct damon_call_control *control;
+ int ret = 0;
+
+ mutex_lock(&ctx->call_control_lock);
+ control = ctx->call_control;
+ mutex_unlock(&ctx->call_control_lock);
+ if (!control)
+ return;
+ if (cancel) {
+ control->canceled = true;
+ } else {
+ ret = control->fn(control->data);
+ control->return_code = ret;
+ }
+ complete(&control->completion);
+ mutex_lock(&ctx->call_control_lock);
+ ctx->call_control = NULL;
+ mutex_unlock(&ctx->call_control_lock);
+}
+
/* Returns negative error code if it's not activated but should return */
static int kdamond_wait_activation(struct damon_ctx *ctx)
{
@@ -1944,6 +2163,8 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
if (ctx->callback.after_wmarks_check &&
ctx->callback.after_wmarks_check(ctx))
break;
+ kdamond_call(ctx, true);
+ damos_walk_cancel(ctx);
}
return -EBUSY;
}
@@ -2014,6 +2235,7 @@ static int kdamond_fn(void *data)
if (ctx->callback.after_sampling &&
ctx->callback.after_sampling(ctx))
break;
+ kdamond_call(ctx, false);
kdamond_usleep(sample_interval);
ctx->passed_sample_intervals++;
@@ -2036,6 +2258,8 @@ static int kdamond_fn(void *data)
*/
if (!list_empty(&ctx->schemes))
kdamond_apply_schemes(ctx);
+ else
+ damos_walk_cancel(ctx);
sample_interval = ctx->attrs.sample_interval ?
ctx->attrs.sample_interval : 1;
@@ -2075,6 +2299,9 @@ done:
ctx->kdamond = NULL;
mutex_unlock(&ctx->kdamond_lock);
+ kdamond_call(ctx, true);
+ damos_walk_cancel(ctx);
+
mutex_lock(&damon_lock);
nr_running_ctxs--;
if (!nr_running_ctxs && running_exclusive_ctxs)
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
deleted file mode 100644
index b4213bc47e44..000000000000
--- a/mm/damon/dbgfs.c
+++ /dev/null
@@ -1,1148 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DAMON Debugfs Interface
- *
- * Author: SeongJae Park <sj@kernel.org>
- */
-
-#define pr_fmt(fmt) "damon-dbgfs: " fmt
-
-#include <linux/damon.h>
-#include <linux/debugfs.h>
-#include <linux/file.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/page_idle.h>
-#include <linux/slab.h>
-
-#define DAMON_DBGFS_DEPRECATION_NOTICE \
- "DAMON debugfs interface is deprecated, so users should move " \
- "to DAMON_SYSFS. If you cannot, please report your usecase to " \
- "damon@lists.linux.dev and linux-mm@kvack.org.\n"
-
-static struct damon_ctx **dbgfs_ctxs;
-static int dbgfs_nr_ctxs;
-static struct dentry **dbgfs_dirs;
-static DEFINE_MUTEX(damon_dbgfs_lock);
-
-static void damon_dbgfs_warn_deprecation(void)
-{
- pr_warn_once(DAMON_DBGFS_DEPRECATION_NOTICE);
-}
-
-/*
- * Returns non-empty string on success, negative error code otherwise.
- */
-static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos)
-{
- char *kbuf;
- ssize_t ret;
-
- /* We do not accept continuous write */
- if (*ppos)
- return ERR_PTR(-EINVAL);
-
- kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN);
- if (!kbuf)
- return ERR_PTR(-ENOMEM);
-
- ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count);
- if (ret != count) {
- kfree(kbuf);
- return ERR_PTR(-EIO);
- }
- kbuf[ret] = '\0';
-
- return kbuf;
-}
-
-static ssize_t dbgfs_attrs_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char kbuf[128];
- int ret;
-
- mutex_lock(&ctx->kdamond_lock);
- ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n",
- ctx->attrs.sample_interval, ctx->attrs.aggr_interval,
- ctx->attrs.ops_update_interval,
- ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions);
- mutex_unlock(&ctx->kdamond_lock);
-
- return simple_read_from_buffer(buf, count, ppos, kbuf, ret);
-}
-
-static ssize_t dbgfs_attrs_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- struct damon_attrs attrs;
- char *kbuf;
- ssize_t ret;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- if (sscanf(kbuf, "%lu %lu %lu %lu %lu",
- &attrs.sample_interval, &attrs.aggr_interval,
- &attrs.ops_update_interval,
- &attrs.min_nr_regions,
- &attrs.max_nr_regions) != 5) {
- ret = -EINVAL;
- goto out;
- }
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- ret = -EBUSY;
- goto unlock_out;
- }
-
- ret = damon_set_attrs(ctx, &attrs);
- if (!ret)
- ret = count;
-unlock_out:
- mutex_unlock(&ctx->kdamond_lock);
-out:
- kfree(kbuf);
- return ret;
-}
-
-/*
- * Return corresponding dbgfs' scheme action value (int) for the given
- * damos_action if the given damos_action value is valid and supported by
- * dbgfs, negative error code otherwise.
- */
-static int damos_action_to_dbgfs_scheme_action(enum damos_action action)
-{
- switch (action) {
- case DAMOS_WILLNEED:
- return 0;
- case DAMOS_COLD:
- return 1;
- case DAMOS_PAGEOUT:
- return 2;
- case DAMOS_HUGEPAGE:
- return 3;
- case DAMOS_NOHUGEPAGE:
- return 4;
- case DAMOS_STAT:
- return 5;
- default:
- return -EINVAL;
- }
-}
-
-static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
-{
- struct damos *s;
- int written = 0;
- int rc;
-
- damon_for_each_scheme(s, c) {
- rc = scnprintf(&buf[written], len - written,
- "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
- s->pattern.min_sz_region,
- s->pattern.max_sz_region,
- s->pattern.min_nr_accesses,
- s->pattern.max_nr_accesses,
- s->pattern.min_age_region,
- s->pattern.max_age_region,
- damos_action_to_dbgfs_scheme_action(s->action),
- s->quota.ms, s->quota.sz,
- s->quota.reset_interval,
- s->quota.weight_sz,
- s->quota.weight_nr_accesses,
- s->quota.weight_age,
- s->wmarks.metric, s->wmarks.interval,
- s->wmarks.high, s->wmarks.mid, s->wmarks.low,
- s->stat.nr_tried, s->stat.sz_tried,
- s->stat.nr_applied, s->stat.sz_applied,
- s->stat.qt_exceeds);
- if (!rc)
- return -ENOMEM;
-
- written += rc;
- }
- return written;
-}
-
-static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- ssize_t len;
-
- kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
- if (!kbuf)
- return -ENOMEM;
-
- mutex_lock(&ctx->kdamond_lock);
- len = sprint_schemes(ctx, kbuf, count);
- mutex_unlock(&ctx->kdamond_lock);
- if (len < 0)
- goto out;
- len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
-
-out:
- kfree(kbuf);
- return len;
-}
-
-static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes)
-{
- ssize_t i;
-
- for (i = 0; i < nr_schemes; i++)
- kfree(schemes[i]);
- kfree(schemes);
-}
-
-/*
- * Return corresponding damos_action for the given dbgfs input for a scheme
- * action if the input is valid, negative error code otherwise.
- */
-static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action)
-{
- switch (dbgfs_action) {
- case 0:
- return DAMOS_WILLNEED;
- case 1:
- return DAMOS_COLD;
- case 2:
- return DAMOS_PAGEOUT;
- case 3:
- return DAMOS_HUGEPAGE;
- case 4:
- return DAMOS_NOHUGEPAGE;
- case 5:
- return DAMOS_STAT;
- default:
- return -EINVAL;
- }
-}
-
-/*
- * Converts a string into an array of struct damos pointers
- *
- * Returns an array of struct damos pointers that converted if the conversion
- * success, or NULL otherwise.
- */
-static struct damos **str_to_schemes(const char *str, ssize_t len,
- ssize_t *nr_schemes)
-{
- struct damos *scheme, **schemes;
- const int max_nr_schemes = 256;
- int pos = 0, parsed, ret;
- unsigned int action_input;
- enum damos_action action;
-
- schemes = kmalloc_array(max_nr_schemes, sizeof(scheme),
- GFP_KERNEL);
- if (!schemes)
- return NULL;
-
- *nr_schemes = 0;
- while (pos < len && *nr_schemes < max_nr_schemes) {
- struct damos_access_pattern pattern = {};
- struct damos_quota quota = {};
- struct damos_watermarks wmarks;
-
- ret = sscanf(&str[pos],
- "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
- &pattern.min_sz_region, &pattern.max_sz_region,
- &pattern.min_nr_accesses,
- &pattern.max_nr_accesses,
- &pattern.min_age_region,
- &pattern.max_age_region,
- &action_input, &quota.ms,
- &quota.sz, &quota.reset_interval,
- &quota.weight_sz, &quota.weight_nr_accesses,
- &quota.weight_age, &wmarks.metric,
- &wmarks.interval, &wmarks.high, &wmarks.mid,
- &wmarks.low, &parsed);
- if (ret != 18)
- break;
- action = dbgfs_scheme_action_to_damos_action(action_input);
- if ((int)action < 0)
- goto fail;
-
- if (pattern.min_sz_region > pattern.max_sz_region ||
- pattern.min_nr_accesses > pattern.max_nr_accesses ||
- pattern.min_age_region > pattern.max_age_region)
- goto fail;
-
- if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low ||
- wmarks.mid < wmarks.low)
- goto fail;
-
- pos += parsed;
- scheme = damon_new_scheme(&pattern, action, 0, &quota,
- &wmarks, NUMA_NO_NODE);
- if (!scheme)
- goto fail;
-
- schemes[*nr_schemes] = scheme;
- *nr_schemes += 1;
- }
- return schemes;
-fail:
- free_schemes_arr(schemes, *nr_schemes);
- return NULL;
-}
-
-static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- struct damos **schemes;
- ssize_t nr_schemes = 0, ret;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- schemes = str_to_schemes(kbuf, count, &nr_schemes);
- if (!schemes) {
- ret = -EINVAL;
- goto out;
- }
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- ret = -EBUSY;
- goto unlock_out;
- }
-
- damon_set_schemes(ctx, schemes, nr_schemes);
- ret = count;
- nr_schemes = 0;
-
-unlock_out:
- mutex_unlock(&ctx->kdamond_lock);
- free_schemes_arr(schemes, nr_schemes);
-out:
- kfree(kbuf);
- return ret;
-}
-
-static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
-{
- struct damon_target *t;
- int id;
- int written = 0;
- int rc;
-
- damon_for_each_target(t, ctx) {
- if (damon_target_has_pid(ctx))
- /* Show pid numbers to debugfs users */
- id = pid_vnr(t->pid);
- else
- /* Show 42 for physical address space, just for fun */
- id = 42;
-
- rc = scnprintf(&buf[written], len - written, "%d ", id);
- if (!rc)
- return -ENOMEM;
- written += rc;
- }
- if (written)
- written -= 1;
- written += scnprintf(&buf[written], len - written, "\n");
- return written;
-}
-
-static ssize_t dbgfs_target_ids_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- ssize_t len;
- char ids_buf[320];
-
- mutex_lock(&ctx->kdamond_lock);
- len = sprint_target_ids(ctx, ids_buf, 320);
- mutex_unlock(&ctx->kdamond_lock);
- if (len < 0)
- return len;
-
- return simple_read_from_buffer(buf, count, ppos, ids_buf, len);
-}
-
-/*
- * Converts a string into an integers array
- *
- * Returns an array of integers array if the conversion success, or NULL
- * otherwise.
- */
-static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints)
-{
- int *array;
- const int max_nr_ints = 32;
- int nr;
- int pos = 0, parsed, ret;
-
- *nr_ints = 0;
- array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL);
- if (!array)
- return NULL;
- while (*nr_ints < max_nr_ints && pos < len) {
- ret = sscanf(&str[pos], "%d%n", &nr, &parsed);
- pos += parsed;
- if (ret != 1)
- break;
- array[*nr_ints] = nr;
- *nr_ints += 1;
- }
-
- return array;
-}
-
-static void dbgfs_put_pids(struct pid **pids, int nr_pids)
-{
- int i;
-
- for (i = 0; i < nr_pids; i++)
- put_pid(pids[i]);
-}
-
-/*
- * Converts a string into an struct pid pointers array
- *
- * Returns an array of struct pid pointers if the conversion success, or NULL
- * otherwise.
- */
-static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids)
-{
- int *ints;
- ssize_t nr_ints;
- struct pid **pids;
-
- *nr_pids = 0;
-
- ints = str_to_ints(str, len, &nr_ints);
- if (!ints)
- return NULL;
-
- pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL);
- if (!pids)
- goto out;
-
- for (; *nr_pids < nr_ints; (*nr_pids)++) {
- pids[*nr_pids] = find_get_pid(ints[*nr_pids]);
- if (!pids[*nr_pids]) {
- dbgfs_put_pids(pids, *nr_pids);
- kfree(ints);
- kfree(pids);
- return NULL;
- }
- }
-
-out:
- kfree(ints);
- return pids;
-}
-
-/*
- * dbgfs_set_targets() - Set monitoring targets.
- * @ctx: monitoring context
- * @nr_targets: number of targets
- * @pids: array of target pids (size is same to @nr_targets)
- *
- * This function should not be called while the kdamond is running. @pids is
- * ignored if the context is not configured to have pid in each target. On
- * failure, reference counts of all pids in @pids are decremented.
- *
- * Return: 0 on success, negative error code otherwise.
- */
-static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
- struct pid **pids)
-{
- ssize_t i;
- struct damon_target *t, *next;
-
- damon_for_each_target_safe(t, next, ctx) {
- if (damon_target_has_pid(ctx))
- put_pid(t->pid);
- damon_destroy_target(t);
- }
-
- for (i = 0; i < nr_targets; i++) {
- t = damon_new_target();
- if (!t) {
- damon_for_each_target_safe(t, next, ctx)
- damon_destroy_target(t);
- if (damon_target_has_pid(ctx))
- dbgfs_put_pids(pids, nr_targets);
- return -ENOMEM;
- }
- if (damon_target_has_pid(ctx))
- t->pid = pids[i];
- damon_add_target(ctx, t);
- }
-
- return 0;
-}
-
-static ssize_t dbgfs_target_ids_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- bool id_is_pid = true;
- char *kbuf;
- struct pid **target_pids = NULL;
- ssize_t nr_targets;
- ssize_t ret;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- if (!strncmp(kbuf, "paddr\n", count)) {
- id_is_pid = false;
- nr_targets = 1;
- }
-
- if (id_is_pid) {
- target_pids = str_to_pids(kbuf, count, &nr_targets);
- if (!target_pids) {
- ret = -ENOMEM;
- goto out;
- }
- }
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- if (id_is_pid)
- dbgfs_put_pids(target_pids, nr_targets);
- ret = -EBUSY;
- goto unlock_out;
- }
-
- /* remove previously set targets */
- dbgfs_set_targets(ctx, 0, NULL);
- if (!nr_targets) {
- ret = count;
- goto unlock_out;
- }
-
- /* Configure the context for the address space type */
- if (id_is_pid)
- ret = damon_select_ops(ctx, DAMON_OPS_VADDR);
- else
- ret = damon_select_ops(ctx, DAMON_OPS_PADDR);
- if (ret)
- goto unlock_out;
-
- ret = dbgfs_set_targets(ctx, nr_targets, target_pids);
- if (!ret)
- ret = count;
-
-unlock_out:
- mutex_unlock(&ctx->kdamond_lock);
- kfree(target_pids);
-out:
- kfree(kbuf);
- return ret;
-}
-
-static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len)
-{
- struct damon_target *t;
- struct damon_region *r;
- int target_idx = 0;
- int written = 0;
- int rc;
-
- damon_for_each_target(t, c) {
- damon_for_each_region(r, t) {
- rc = scnprintf(&buf[written], len - written,
- "%d %lu %lu\n",
- target_idx, r->ar.start, r->ar.end);
- if (!rc)
- return -ENOMEM;
- written += rc;
- }
- target_idx++;
- }
- return written;
-}
-
-static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- ssize_t len;
-
- kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
- if (!kbuf)
- return -ENOMEM;
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- mutex_unlock(&ctx->kdamond_lock);
- len = -EBUSY;
- goto out;
- }
-
- len = sprint_init_regions(ctx, kbuf, count);
- mutex_unlock(&ctx->kdamond_lock);
- if (len < 0)
- goto out;
- len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
-
-out:
- kfree(kbuf);
- return len;
-}
-
-static int add_init_region(struct damon_ctx *c, int target_idx,
- struct damon_addr_range *ar)
-{
- struct damon_target *t;
- struct damon_region *r, *prev;
- unsigned long idx = 0;
- int rc = -EINVAL;
-
- if (ar->start >= ar->end)
- return -EINVAL;
-
- damon_for_each_target(t, c) {
- if (idx++ == target_idx) {
- r = damon_new_region(ar->start, ar->end);
- if (!r)
- return -ENOMEM;
- damon_add_region(r, t);
- if (damon_nr_regions(t) > 1) {
- prev = damon_prev_region(r);
- if (prev->ar.end > r->ar.start) {
- damon_destroy_region(r, t);
- return -EINVAL;
- }
- }
- rc = 0;
- }
- }
- return rc;
-}
-
-static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
-{
- struct damon_target *t;
- struct damon_region *r, *next;
- int pos = 0, parsed, ret;
- int target_idx;
- struct damon_addr_range ar;
- int err;
-
- damon_for_each_target(t, c) {
- damon_for_each_region_safe(r, next, t)
- damon_destroy_region(r, t);
- }
-
- while (pos < len) {
- ret = sscanf(&str[pos], "%d %lu %lu%n",
- &target_idx, &ar.start, &ar.end, &parsed);
- if (ret != 3)
- break;
- err = add_init_region(c, target_idx, &ar);
- if (err)
- goto fail;
- pos += parsed;
- }
-
- return 0;
-
-fail:
- damon_for_each_target(t, c) {
- damon_for_each_region_safe(r, next, t)
- damon_destroy_region(r, t);
- }
- return err;
-}
-
-static ssize_t dbgfs_init_regions_write(struct file *file,
- const char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- ssize_t ret = count;
- int err;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- ret = -EBUSY;
- goto unlock_out;
- }
-
- err = set_init_regions(ctx, kbuf, ret);
- if (err)
- ret = err;
-
-unlock_out:
- mutex_unlock(&ctx->kdamond_lock);
- kfree(kbuf);
- return ret;
-}
-
-static ssize_t dbgfs_kdamond_pid_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- struct damon_ctx *ctx = file->private_data;
- char *kbuf;
- ssize_t len;
-
- kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
- if (!kbuf)
- return -ENOMEM;
-
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond)
- len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid);
- else
- len = scnprintf(kbuf, count, "none\n");
- mutex_unlock(&ctx->kdamond_lock);
- if (!len)
- goto out;
- len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
-
-out:
- kfree(kbuf);
- return len;
-}
-
-static int damon_dbgfs_open(struct inode *inode, struct file *file)
-{
- damon_dbgfs_warn_deprecation();
-
- file->private_data = inode->i_private;
-
- return nonseekable_open(inode, file);
-}
-
-static const struct file_operations attrs_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_attrs_read,
- .write = dbgfs_attrs_write,
-};
-
-static const struct file_operations schemes_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_schemes_read,
- .write = dbgfs_schemes_write,
-};
-
-static const struct file_operations target_ids_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_target_ids_read,
- .write = dbgfs_target_ids_write,
-};
-
-static const struct file_operations init_regions_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_init_regions_read,
- .write = dbgfs_init_regions_write,
-};
-
-static const struct file_operations kdamond_pid_fops = {
- .open = damon_dbgfs_open,
- .read = dbgfs_kdamond_pid_read,
-};
-
-static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx)
-{
- const char * const file_names[] = {"attrs", "schemes", "target_ids",
- "init_regions", "kdamond_pid"};
- const struct file_operations *fops[] = {&attrs_fops, &schemes_fops,
- &target_ids_fops, &init_regions_fops, &kdamond_pid_fops};
- int i;
-
- for (i = 0; i < ARRAY_SIZE(file_names); i++)
- debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]);
-}
-
-static void dbgfs_before_terminate(struct damon_ctx *ctx)
-{
- struct damon_target *t, *next;
-
- if (!damon_target_has_pid(ctx))
- return;
-
- mutex_lock(&ctx->kdamond_lock);
- damon_for_each_target_safe(t, next, ctx) {
- put_pid(t->pid);
- damon_destroy_target(t);
- }
- mutex_unlock(&ctx->kdamond_lock);
-}
-
-static struct damon_ctx *dbgfs_new_ctx(void)
-{
- struct damon_ctx *ctx;
-
- ctx = damon_new_ctx();
- if (!ctx)
- return NULL;
-
- if (damon_select_ops(ctx, DAMON_OPS_VADDR) &&
- damon_select_ops(ctx, DAMON_OPS_PADDR)) {
- damon_destroy_ctx(ctx);
- return NULL;
- }
- ctx->callback.before_terminate = dbgfs_before_terminate;
- return ctx;
-}
-
-static void dbgfs_destroy_ctx(struct damon_ctx *ctx)
-{
- damon_destroy_ctx(ctx);
-}
-
-static ssize_t damon_dbgfs_deprecated_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- static const char kbuf[512] = DAMON_DBGFS_DEPRECATION_NOTICE;
-
- return simple_read_from_buffer(buf, count, ppos, kbuf, strlen(kbuf));
-}
-
-/*
- * Make a context of @name and create a debugfs directory for it.
- *
- * This function should be called while holding damon_dbgfs_lock.
- *
- * Returns 0 on success, negative error code otherwise.
- */
-static int dbgfs_mk_context(char *name)
-{
- struct dentry *root, **new_dirs, *new_dir;
- struct damon_ctx **new_ctxs, *new_ctx;
-
- if (damon_nr_running_ctxs())
- return -EBUSY;
-
- new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) *
- (dbgfs_nr_ctxs + 1), GFP_KERNEL);
- if (!new_ctxs)
- return -ENOMEM;
- dbgfs_ctxs = new_ctxs;
-
- new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) *
- (dbgfs_nr_ctxs + 1), GFP_KERNEL);
- if (!new_dirs)
- return -ENOMEM;
- dbgfs_dirs = new_dirs;
-
- root = dbgfs_dirs[0];
- if (!root)
- return -ENOENT;
-
- new_dir = debugfs_create_dir(name, root);
- /* Below check is required for a potential duplicated name case */
- if (IS_ERR(new_dir))
- return PTR_ERR(new_dir);
- dbgfs_dirs[dbgfs_nr_ctxs] = new_dir;
-
- new_ctx = dbgfs_new_ctx();
- if (!new_ctx) {
- debugfs_remove(new_dir);
- dbgfs_dirs[dbgfs_nr_ctxs] = NULL;
- return -ENOMEM;
- }
-
- dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx;
- dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs],
- dbgfs_ctxs[dbgfs_nr_ctxs]);
- dbgfs_nr_ctxs++;
-
- return 0;
-}
-
-static ssize_t dbgfs_mk_context_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- char *kbuf;
- char *ctx_name;
- ssize_t ret;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- ctx_name = kmalloc(count + 1, GFP_KERNEL);
- if (!ctx_name) {
- kfree(kbuf);
- return -ENOMEM;
- }
-
- /* Trim white space */
- if (sscanf(kbuf, "%s", ctx_name) != 1) {
- ret = -EINVAL;
- goto out;
- }
-
- mutex_lock(&damon_dbgfs_lock);
- ret = dbgfs_mk_context(ctx_name);
- if (!ret)
- ret = count;
- mutex_unlock(&damon_dbgfs_lock);
-
-out:
- kfree(kbuf);
- kfree(ctx_name);
- return ret;
-}
-
-/*
- * Remove a context of @name and its debugfs directory.
- *
- * This function should be called while holding damon_dbgfs_lock.
- *
- * Return 0 on success, negative error code otherwise.
- */
-static int dbgfs_rm_context(char *name)
-{
- struct dentry *root, *dir, **new_dirs;
- struct inode *inode;
- struct damon_ctx **new_ctxs;
- int i, j;
- int ret = 0;
-
- if (damon_nr_running_ctxs())
- return -EBUSY;
-
- root = dbgfs_dirs[0];
- if (!root)
- return -ENOENT;
-
- dir = debugfs_lookup(name, root);
- if (!dir)
- return -ENOENT;
-
- inode = d_inode(dir);
- if (!S_ISDIR(inode->i_mode)) {
- ret = -EINVAL;
- goto out_dput;
- }
-
- new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
- GFP_KERNEL);
- if (!new_dirs) {
- ret = -ENOMEM;
- goto out_dput;
- }
-
- new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs),
- GFP_KERNEL);
- if (!new_ctxs) {
- ret = -ENOMEM;
- goto out_new_dirs;
- }
-
- for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) {
- if (dbgfs_dirs[i] == dir) {
- debugfs_remove(dbgfs_dirs[i]);
- dbgfs_destroy_ctx(dbgfs_ctxs[i]);
- continue;
- }
- new_dirs[j] = dbgfs_dirs[i];
- new_ctxs[j++] = dbgfs_ctxs[i];
- }
-
- kfree(dbgfs_dirs);
- kfree(dbgfs_ctxs);
-
- dbgfs_dirs = new_dirs;
- dbgfs_ctxs = new_ctxs;
- dbgfs_nr_ctxs--;
-
- goto out_dput;
-
-out_new_dirs:
- kfree(new_dirs);
-out_dput:
- dput(dir);
- return ret;
-}
-
-static ssize_t dbgfs_rm_context_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- char *kbuf;
- ssize_t ret;
- char *ctx_name;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- ctx_name = kmalloc(count + 1, GFP_KERNEL);
- if (!ctx_name) {
- kfree(kbuf);
- return -ENOMEM;
- }
-
- /* Trim white space */
- if (sscanf(kbuf, "%s", ctx_name) != 1) {
- ret = -EINVAL;
- goto out;
- }
-
- mutex_lock(&damon_dbgfs_lock);
- ret = dbgfs_rm_context(ctx_name);
- if (!ret)
- ret = count;
- mutex_unlock(&damon_dbgfs_lock);
-
-out:
- kfree(kbuf);
- kfree(ctx_name);
- return ret;
-}
-
-static ssize_t dbgfs_monitor_on_read(struct file *file,
- char __user *buf, size_t count, loff_t *ppos)
-{
- char monitor_on_buf[5];
- bool monitor_on = damon_nr_running_ctxs() != 0;
- int len;
-
- len = scnprintf(monitor_on_buf, 5, monitor_on ? "on\n" : "off\n");
-
- return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len);
-}
-
-static ssize_t dbgfs_monitor_on_write(struct file *file,
- const char __user *buf, size_t count, loff_t *ppos)
-{
- ssize_t ret;
- char *kbuf;
-
- kbuf = user_input_str(buf, count, ppos);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
-
- /* Remove white space */
- if (sscanf(kbuf, "%s", kbuf) != 1) {
- kfree(kbuf);
- return -EINVAL;
- }
-
- mutex_lock(&damon_dbgfs_lock);
- if (!strncmp(kbuf, "on", count)) {
- int i;
-
- for (i = 0; i < dbgfs_nr_ctxs; i++) {
- if (damon_targets_empty(dbgfs_ctxs[i])) {
- kfree(kbuf);
- mutex_unlock(&damon_dbgfs_lock);
- return -EINVAL;
- }
- }
- ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true);
- } else if (!strncmp(kbuf, "off", count)) {
- ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
- } else {
- ret = -EINVAL;
- }
- mutex_unlock(&damon_dbgfs_lock);
-
- if (!ret)
- ret = count;
- kfree(kbuf);
- return ret;
-}
-
-static int damon_dbgfs_static_file_open(struct inode *inode, struct file *file)
-{
- damon_dbgfs_warn_deprecation();
- return nonseekable_open(inode, file);
-}
-
-static const struct file_operations deprecated_fops = {
- .read = damon_dbgfs_deprecated_read,
-};
-
-static const struct file_operations mk_contexts_fops = {
- .open = damon_dbgfs_static_file_open,
- .write = dbgfs_mk_context_write,
-};
-
-static const struct file_operations rm_contexts_fops = {
- .open = damon_dbgfs_static_file_open,
- .write = dbgfs_rm_context_write,
-};
-
-static const struct file_operations monitor_on_fops = {
- .open = damon_dbgfs_static_file_open,
- .read = dbgfs_monitor_on_read,
- .write = dbgfs_monitor_on_write,
-};
-
-static int __init __damon_dbgfs_init(void)
-{
- struct dentry *dbgfs_root;
- const char * const file_names[] = {"mk_contexts", "rm_contexts",
- "monitor_on_DEPRECATED", "DEPRECATED"};
- const struct file_operations *fops[] = {&mk_contexts_fops,
- &rm_contexts_fops, &monitor_on_fops, &deprecated_fops};
- int i;
-
- dbgfs_root = debugfs_create_dir("damon", NULL);
-
- for (i = 0; i < ARRAY_SIZE(file_names); i++)
- debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL,
- fops[i]);
- dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]);
-
- dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL);
- if (!dbgfs_dirs) {
- debugfs_remove(dbgfs_root);
- return -ENOMEM;
- }
- dbgfs_dirs[0] = dbgfs_root;
-
- return 0;
-}
-
-/*
- * Functions for the initialization
- */
-
-static int __init damon_dbgfs_init(void)
-{
- int rc = -ENOMEM;
-
- mutex_lock(&damon_dbgfs_lock);
- dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL);
- if (!dbgfs_ctxs)
- goto out;
- dbgfs_ctxs[0] = dbgfs_new_ctx();
- if (!dbgfs_ctxs[0]) {
- kfree(dbgfs_ctxs);
- goto out;
- }
- dbgfs_nr_ctxs = 1;
-
- rc = __damon_dbgfs_init();
- if (rc) {
- kfree(dbgfs_ctxs[0]);
- kfree(dbgfs_ctxs);
- pr_err("%s: dbgfs init failed\n", __func__);
- }
-
-out:
- mutex_unlock(&damon_dbgfs_lock);
- return rc;
-}
-
-module_init(damon_dbgfs_init);
-
-#include "tests/dbgfs-kunit.h"
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index a9ff35341d65..c834aa217835 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -198,7 +198,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
-static bool __damos_pa_filter_out(struct damos_filter *filter,
+static bool damos_pa_filter_match(struct damos_filter *filter,
struct folio *folio)
{
bool matched = false;
@@ -236,14 +236,18 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
{
struct damos_filter *filter;
+ if (scheme->core_filters_allowed)
+ return false;
+
damos_for_each_filter(filter, scheme) {
- if (__damos_pa_filter_out(filter, folio))
- return true;
+ if (damos_pa_filter_match(filter, folio))
+ return !filter->allow;
}
return false;
}
-static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
+static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
{
unsigned long addr, applied;
LIST_HEAD(folio_list);
@@ -258,7 +262,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
}
}
if (install_young_filter) {
- filter = damos_new_filter(DAMOS_FILTER_TYPE_YOUNG, true);
+ filter = damos_new_filter(
+ DAMOS_FILTER_TYPE_YOUNG, true, false);
if (!filter)
return 0;
damos_add_filter(s, filter);
@@ -272,6 +277,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
if (damos_pa_filter_out(s, folio))
goto put_folio;
+ else
+ *sz_filter_passed += folio_size(folio);
folio_clear_referenced(folio);
folio_test_clear_young(folio);
@@ -292,7 +299,8 @@ put_folio:
}
static inline unsigned long damon_pa_mark_accessed_or_deactivate(
- struct damon_region *r, struct damos *s, bool mark_accessed)
+ struct damon_region *r, struct damos *s, bool mark_accessed,
+ unsigned long *sz_filter_passed)
{
unsigned long addr, applied = 0;
@@ -304,6 +312,8 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
if (damos_pa_filter_out(s, folio))
goto put_folio;
+ else
+ *sz_filter_passed += folio_size(folio);
if (mark_accessed)
folio_mark_accessed(folio);
@@ -317,15 +327,17 @@ put_folio:
}
static unsigned long damon_pa_mark_accessed(struct damon_region *r,
- struct damos *s)
+ struct damos *s, unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, true);
+ return damon_pa_mark_accessed_or_deactivate(r, s, true,
+ sz_filter_passed);
}
static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
- struct damos *s)
+ struct damos *s, unsigned long *sz_filter_passed)
{
- return damon_pa_mark_accessed_or_deactivate(r, s, false);
+ return damon_pa_mark_accessed_or_deactivate(r, s, false,
+ sz_filter_passed);
}
static unsigned int __damon_pa_migrate_folio_list(
@@ -449,7 +461,8 @@ static unsigned long damon_pa_migrate_pages(struct list_head *folio_list,
return nr_migrated;
}
-static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s)
+static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
{
unsigned long addr, applied;
LIST_HEAD(folio_list);
@@ -462,6 +475,8 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s)
if (damos_pa_filter_out(s, folio))
goto put_folio;
+ else
+ *sz_filter_passed += folio_size(folio);
if (!folio_isolate_lru(folio))
goto put_folio;
@@ -474,23 +489,57 @@ put_folio:
return applied * PAGE_SIZE;
}
+static bool damon_pa_scheme_has_filter(struct damos *s)
+{
+ struct damos_filter *f;
+
+ damos_for_each_filter(f, s)
+ return true;
+ return false;
+}
+
+static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s,
+ unsigned long *sz_filter_passed)
+{
+ unsigned long addr;
+ LIST_HEAD(folio_list);
+
+ if (!damon_pa_scheme_has_filter(s))
+ return 0;
+
+ addr = r->ar.start;
+ while (addr < r->ar.end) {
+ struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+ if (!folio) {
+ addr += PAGE_SIZE;
+ continue;
+ }
+
+ if (!damos_pa_filter_out(s, folio))
+ *sz_filter_passed += folio_size(folio);
+ addr += folio_size(folio);
+ folio_put(folio);
+ }
+ return 0;
+}
static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
- struct damos *scheme)
+ struct damos *scheme, unsigned long *sz_filter_passed)
{
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pa_pageout(r, scheme);
+ return damon_pa_pageout(r, scheme, sz_filter_passed);
case DAMOS_LRU_PRIO:
- return damon_pa_mark_accessed(r, scheme);
+ return damon_pa_mark_accessed(r, scheme, sz_filter_passed);
case DAMOS_LRU_DEPRIO:
- return damon_pa_deactivate_pages(r, scheme);
+ return damon_pa_deactivate_pages(r, scheme, sz_filter_passed);
case DAMOS_MIGRATE_HOT:
case DAMOS_MIGRATE_COLD:
- return damon_pa_migrate(r, scheme);
+ return damon_pa_migrate(r, scheme, sz_filter_passed);
case DAMOS_STAT:
- break;
+ return damon_pa_stat(r, scheme, sz_filter_passed);
default:
/* DAMOS actions that not yet supported by 'paddr'. */
break;
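
The paddr hunks above thread a new sz_filter_passed out-parameter through every DAMOS action handler so the operations set can report how many bytes of a region were let through by its ops-level filters. A minimal sketch of the resulting contract, using only the signatures visible in the hunks; the wrapper below and its pr_debug() message are illustrative assumptions:

static void damon_pa_apply_and_report(struct damon_ctx *ctx,
		struct damon_target *t, struct damon_region *r,
		struct damos *scheme)
{
	unsigned long sz_filter_passed = 0;
	unsigned long sz_applied;

	/* each action handler adds folio_size() of every passing folio */
	sz_applied = damon_pa_apply_scheme(ctx, t, r, scheme,
			&sz_filter_passed);
	pr_debug("applied %lu bytes, %lu bytes passed ops filters\n",
			sz_applied, sz_filter_passed);
}
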
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 9e0077a9404e..a675150965e0 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -221,7 +221,7 @@ static int damon_reclaim_apply_parameters(void)
}
if (skip_anon) {
- filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false);
if (!filter)
goto out;
damos_add_filter(scheme, filter);
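
As the reclaim hunk above shows, damos_new_filter() now takes a third 'allow' argument in addition to the filter type and the matching flag. A hedged sketch of installing an allow-type filter with the new signature; the helper name is an assumption and the scheme is expected to come from the caller:

static int damon_install_anon_allow_filter(struct damos *scheme)
{
	struct damos_filter *filter;

	/* arguments are type, matching, allow: only anon pages pass */
	filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true);
	if (!filter)
		return -ENOMEM;
	damos_add_filter(scheme, filter);
	return 0;
}
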
diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h
index 9a18f3c535d3..70d84bdc9f5f 100644
--- a/mm/damon/sysfs-common.h
+++ b/mm/damon/sysfs-common.h
@@ -45,19 +45,13 @@ void damon_sysfs_schemes_update_stats(
struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx);
-int damon_sysfs_schemes_update_regions_start(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx, bool total_bytes_only);
-
-void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx);
-
-bool damos_sysfs_regions_upd_done(void);
-
-int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx);
+void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *s,
+ bool total_bytes_only, unsigned long sz_filter_passed);
int damon_sysfs_schemes_clear_regions(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx);
+ struct damon_sysfs_schemes *sysfs_schemes);
int damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
struct damon_ctx *ctx);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index b095457380b5..98f93ae9f59e 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -19,6 +19,7 @@ struct damon_sysfs_scheme_region {
struct damon_addr_range ar;
unsigned int nr_accesses;
unsigned int age;
+ unsigned long sz_filter_passed;
struct list_head list;
};
@@ -74,6 +75,15 @@ static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr,
return sysfs_emit(buf, "%u\n", region->age);
}
+static ssize_t sz_filter_passed_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_region *region = container_of(kobj,
+ struct damon_sysfs_scheme_region, kobj);
+
+ return sysfs_emit(buf, "%lu\n", region->sz_filter_passed);
+}
+
static void damon_sysfs_scheme_region_release(struct kobject *kobj)
{
struct damon_sysfs_scheme_region *region = container_of(kobj,
@@ -95,11 +105,15 @@ static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr =
static struct kobj_attribute damon_sysfs_scheme_region_age_attr =
__ATTR_RO_MODE(age, 0400);
+static struct kobj_attribute damon_sysfs_scheme_region_sz_filter_passed_attr =
+ __ATTR_RO_MODE(sz_filter_passed, 0400);
+
static struct attribute *damon_sysfs_scheme_region_attrs[] = {
&damon_sysfs_scheme_region_start_attr.attr,
&damon_sysfs_scheme_region_end_attr.attr,
&damon_sysfs_scheme_region_nr_accesses_attr.attr,
&damon_sysfs_scheme_region_age_attr.attr,
+ &damon_sysfs_scheme_region_sz_filter_passed_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_scheme_region);
@@ -114,55 +128,11 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = {
* scheme regions directory
*/
-/*
- * enum damos_sysfs_regions_upd_status - Represent DAMOS tried regions update
- * status
- * @DAMOS_TRIED_REGIONS_UPD_IDLE: Waiting for next request.
- * @DAMOS_TRIED_REGIONS_UPD_STARTED: Update started.
- * @DAMOS_TRIED_REGIONS_UPD_FINISHED: Update finished.
- *
- * Each DAMON-based operation scheme (&struct damos) has its own apply
- * interval, and we need to expose the scheme tried regions based on only
- * single snapshot. For this, we keep the tried regions update status for each
- * scheme. The status becomes 'idle' at the beginning.
- *
- * Once the tried regions update request is received, the request handling
- * start function (damon_sysfs_scheme_update_regions_start()) sets the status
- * of all schemes as 'idle' again, and register ->before_damos_apply()
- * callback.
- *
- * Then, the first followup ->before_damos_apply() callback
- * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first
- * ->after_sampling() or ->after_aggregation() callback
- * (damon_sysfs_cmd_request_callback()) after the call is called only after
- * the scheme is completely applied to the given snapshot. Hence the callback
- * knows the situation by showing 'started' status, and sets the status as
- * 'finished'. Then, damon_sysfs_before_damos_apply() understands the
- * situation by showing the 'finished' status and do nothing.
- *
- * If DAMOS is not applied to any region due to any reasons including the
- * access pattern, the watermarks, the quotas, and the filters,
- * ->before_damos_apply() will not be called back. Until the situation is
- * changed, the update will not be finished. To avoid this,
- * damon_sysfs_after_sampling() set the status as 'finished' if more than two
- * apply intervals of the scheme is passed while the state is 'idle'.
- *
- * Finally, the tried regions request handling finisher function
- * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks.
- */
-enum damos_sysfs_regions_upd_status {
- DAMOS_TRIED_REGIONS_UPD_IDLE,
- DAMOS_TRIED_REGIONS_UPD_STARTED,
- DAMOS_TRIED_REGIONS_UPD_FINISHED,
-};
-
struct damon_sysfs_scheme_regions {
struct kobject kobj;
struct list_head regions_list;
int nr_regions;
unsigned long total_bytes;
- enum damos_sysfs_regions_upd_status upd_status;
- unsigned long upd_timeout_jiffies;
};
static struct damon_sysfs_scheme_regions *
@@ -178,7 +148,6 @@ damon_sysfs_scheme_regions_alloc(void)
INIT_LIST_HEAD(&regions->regions_list);
regions->nr_regions = 0;
regions->total_bytes = 0;
- regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE;
return regions;
}
@@ -233,6 +202,7 @@ struct damon_sysfs_stats {
unsigned long sz_tried;
unsigned long nr_applied;
unsigned long sz_applied;
+ unsigned long sz_ops_filter_passed;
unsigned long qt_exceeds;
};
@@ -277,6 +247,15 @@ static ssize_t sz_applied_show(struct kobject *kobj,
return sysfs_emit(buf, "%lu\n", stats->sz_applied);
}
+static ssize_t sz_ops_filter_passed_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->sz_ops_filter_passed);
+}
+
static ssize_t qt_exceeds_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -303,6 +282,9 @@ static struct kobj_attribute damon_sysfs_stats_nr_applied_attr =
static struct kobj_attribute damon_sysfs_stats_sz_applied_attr =
__ATTR_RO_MODE(sz_applied, 0400);
+static struct kobj_attribute damon_sysfs_stats_sz_ops_filter_passed_attr =
+ __ATTR_RO_MODE(sz_ops_filter_passed, 0400);
+
static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr =
__ATTR_RO_MODE(qt_exceeds, 0400);
@@ -311,6 +293,7 @@ static struct attribute *damon_sysfs_stats_attrs[] = {
&damon_sysfs_stats_sz_tried_attr.attr,
&damon_sysfs_stats_nr_applied_attr.attr,
&damon_sysfs_stats_sz_applied_attr.attr,
+ &damon_sysfs_stats_sz_ops_filter_passed_attr.attr,
&damon_sysfs_stats_qt_exceeds_attr.attr,
NULL,
};
@@ -330,6 +313,7 @@ struct damon_sysfs_scheme_filter {
struct kobject kobj;
enum damos_filter_type type;
bool matching;
+ bool allow;
char *memcg_path;
struct damon_addr_range addr_range;
int target_idx;
@@ -402,6 +386,30 @@ static ssize_t matching_store(struct kobject *kobj,
return count;
}
+static ssize_t allow_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%c\n", filter->allow ? 'Y' : 'N');
+}
+
+static ssize_t allow_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ bool allow;
+ int err = kstrtobool(buf, &allow);
+
+ if (err)
+ return err;
+
+ filter->allow = allow;
+ return count;
+}
+
static ssize_t memcg_path_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -499,6 +507,9 @@ static struct kobj_attribute damon_sysfs_scheme_filter_type_attr =
static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr =
__ATTR_RW_MODE(matching, 0600);
+static struct kobj_attribute damon_sysfs_scheme_filter_allow_attr =
+ __ATTR_RW_MODE(allow, 0600);
+
static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr =
__ATTR_RW_MODE(memcg_path, 0600);
@@ -514,6 +525,7 @@ static struct kobj_attribute damon_sysfs_scheme_filter_damon_target_idx_attr =
static struct attribute *damon_sysfs_scheme_filter_attrs[] = {
&damon_sysfs_scheme_filter_type_attr.attr,
&damon_sysfs_scheme_filter_matching_attr.attr,
+ &damon_sysfs_scheme_filter_allow_attr.attr,
&damon_sysfs_scheme_filter_memcg_path_attr.attr,
&damon_sysfs_scheme_filter_addr_start_attr.attr,
&damon_sysfs_scheme_filter_addr_end_attr.attr,
@@ -1918,7 +1930,8 @@ static int damon_sysfs_add_scheme_filters(struct damos *scheme,
sysfs_filters->filters_arr[i];
struct damos_filter *filter =
damos_new_filter(sysfs_filter->type,
- sysfs_filter->matching);
+ sysfs_filter->matching,
+ sysfs_filter->allow);
int err;
if (!filter)
@@ -2122,32 +2135,32 @@ void damon_sysfs_schemes_update_stats(
sysfs_stats->sz_tried = scheme->stat.sz_tried;
sysfs_stats->nr_applied = scheme->stat.nr_applied;
sysfs_stats->sz_applied = scheme->stat.sz_applied;
+ sysfs_stats->sz_ops_filter_passed =
+ scheme->stat.sz_ops_filter_passed;
sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
}
}
-/*
- * damon_sysfs_schemes that need to update its schemes regions dir. Protected
- * by damon_sysfs_lock
- */
-static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback;
-static int damon_sysfs_schemes_region_idx;
-static bool damos_regions_upd_total_bytes_only;
-
-/*
- * DAMON callback that called before damos apply. While this callback is
- * registered, damon_sysfs_lock should be held to ensure the regions
- * directories exist.
+/**
+ * damos_sysfs_populate_region_dir() - Populate a schemes tried region dir.
+ * @sysfs_schemes: Schemes directory to populate regions directory.
+ * @ctx: Corresponding DAMON context.
+ * @t: DAMON target of @r.
+ * @r: DAMON region to populate the directory for.
+ * @s: Corresponding scheme.
+ * @total_bytes_only: Whether the request is for bytes update only.
+ * @sz_filter_passed: Bytes of @r that passed filters of @s.
+ *
+ * Called from DAMOS walk callback while holding damon_sysfs_lock.
*/
-static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
- struct damon_target *t, struct damon_region *r,
- struct damos *s)
+void damos_sysfs_populate_region_dir(struct damon_sysfs_schemes *sysfs_schemes,
+ struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *s, bool total_bytes_only,
+ unsigned long sz_filter_passed)
{
struct damos *scheme;
struct damon_sysfs_scheme_regions *sysfs_regions;
struct damon_sysfs_scheme_region *region;
- struct damon_sysfs_schemes *sysfs_schemes =
- damon_sysfs_schemes_for_damos_callback;
int schemes_idx = 0;
damon_for_each_scheme(scheme, ctx) {
@@ -2158,152 +2171,40 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
/* user could have removed the scheme sysfs dir */
if (schemes_idx >= sysfs_schemes->nr)
- return 0;
+ return;
sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions;
- if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_FINISHED)
- return 0;
- if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_IDLE)
- sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_STARTED;
sysfs_regions->total_bytes += r->ar.end - r->ar.start;
- if (damos_regions_upd_total_bytes_only)
- return 0;
+ if (total_bytes_only)
+ return;
region = damon_sysfs_scheme_region_alloc(r);
if (!region)
- return 0;
+ return;
+ region->sz_filter_passed = sz_filter_passed;
list_add_tail(&region->list, &sysfs_regions->regions_list);
sysfs_regions->nr_regions++;
if (kobject_init_and_add(&region->kobj,
&damon_sysfs_scheme_region_ktype,
&sysfs_regions->kobj, "%d",
- damon_sysfs_schemes_region_idx++)) {
+ sysfs_regions->nr_regions++)) {
kobject_put(&region->kobj);
}
- return 0;
-}
-
-/*
- * DAMON callback that called after each accesses sampling. While this
- * callback is registered, damon_sysfs_lock should be held to ensure the
- * regions directories exist.
- */
-void damos_sysfs_mark_finished_regions_updates(struct damon_ctx *ctx)
-{
- struct damon_sysfs_schemes *sysfs_schemes =
- damon_sysfs_schemes_for_damos_callback;
- struct damon_sysfs_scheme_regions *sysfs_regions;
- int i;
-
- for (i = 0; i < sysfs_schemes->nr; i++) {
- sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions;
- if (sysfs_regions->upd_status ==
- DAMOS_TRIED_REGIONS_UPD_STARTED ||
- time_after(jiffies,
- sysfs_regions->upd_timeout_jiffies))
- sysfs_regions->upd_status =
- DAMOS_TRIED_REGIONS_UPD_FINISHED;
- }
}
/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
int damon_sysfs_schemes_clear_regions(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx)
+ struct damon_sysfs_schemes *sysfs_schemes)
{
- struct damos *scheme;
- int schemes_idx = 0;
+ int i;
- damon_for_each_scheme(scheme, ctx) {
+ for (i = 0; i < sysfs_schemes->nr; i++) {
struct damon_sysfs_scheme *sysfs_scheme;
- /* user could have removed the scheme sysfs dir */
- if (schemes_idx >= sysfs_schemes->nr)
- break;
-
- sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++];
+ sysfs_scheme = sysfs_schemes->schemes_arr[i];
damon_sysfs_scheme_regions_rm_dirs(
sysfs_scheme->tried_regions);
sysfs_scheme->tried_regions->total_bytes = 0;
}
return 0;
}
-
-static struct damos *damos_sysfs_nth_scheme(int n, struct damon_ctx *ctx)
-{
- struct damos *scheme;
- int i = 0;
-
- damon_for_each_scheme(scheme, ctx) {
- if (i == n)
- return scheme;
- i++;
- }
- return NULL;
-}
-
-static void damos_tried_regions_init_upd_status(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx)
-{
- int i;
- struct damos *scheme;
- struct damon_sysfs_scheme_regions *sysfs_regions;
-
- for (i = 0; i < sysfs_schemes->nr; i++) {
- sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions;
- scheme = damos_sysfs_nth_scheme(i, ctx);
- if (!scheme) {
- sysfs_regions->upd_status =
- DAMOS_TRIED_REGIONS_UPD_FINISHED;
- continue;
- }
- sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE;
- sysfs_regions->upd_timeout_jiffies = jiffies +
- 2 * usecs_to_jiffies(scheme->apply_interval_us ?
- scheme->apply_interval_us :
- ctx->attrs.aggr_interval);
- }
-}
-
-/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */
-int damon_sysfs_schemes_update_regions_start(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx, bool total_bytes_only)
-{
- damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx);
- damon_sysfs_schemes_for_damos_callback = sysfs_schemes;
- damos_tried_regions_init_upd_status(sysfs_schemes, ctx);
- damos_regions_upd_total_bytes_only = total_bytes_only;
- ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply;
- return 0;
-}
-
-bool damos_sysfs_regions_upd_done(void)
-{
- struct damon_sysfs_schemes *sysfs_schemes =
- damon_sysfs_schemes_for_damos_callback;
- struct damon_sysfs_scheme_regions *sysfs_regions;
- int i;
-
- for (i = 0; i < sysfs_schemes->nr; i++) {
- sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions;
- if (sysfs_regions->upd_status !=
- DAMOS_TRIED_REGIONS_UPD_FINISHED)
- return false;
- }
- return true;
-}
-
-/*
- * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller
- * should unlock damon_sysfs_lock which held before
- * damon_sysfs_schemes_update_regions_start()
- */
-int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx)
-{
- damon_sysfs_schemes_for_damos_callback = NULL;
- ctx->callback.before_damos_apply = NULL;
- damon_sysfs_schemes_region_idx = 0;
- return 0;
-}
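
The new per-filter 'allow' file above sits next to 'type' and 'matching' in the scheme's filters directory and is parsed with kstrtobool(), so 'Y'/'N' or '1'/'0' both work. A user-space sketch of toggling it; the path follows the documented DAMON sysfs layout and the kdamond/context/scheme/filter indices (all 0) are placeholders:

#include <fcntl.h>
#include <unistd.h>

static int set_filter_allow(int allow)
{
	const char *path = "/sys/kernel/mm/damon/admin/kdamonds/0/"
			   "contexts/0/schemes/0/filters/0/allow";
	int fd = open(path, O_WRONLY);
	int ret = -1;

	if (fd < 0)
		return -1;
	if (write(fd, allow ? "Y" : "N", 1) == 1)
		ret = 0;
	close(fd);
	return ret;
}
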
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 58145d59881d..deeab04d3b46 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1181,25 +1181,9 @@ static int damon_sysfs_add_targets(struct damon_ctx *ctx,
return 0;
}
-static bool damon_sysfs_schemes_regions_updating;
-
static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
- struct damon_sysfs_kdamond *kdamond;
- enum damon_sysfs_cmd cmd;
-
- /* damon_sysfs_schemes_update_regions_stop() might not yet called */
- kdamond = damon_sysfs_cmd_request.kdamond;
- cmd = damon_sysfs_cmd_request.cmd;
- if (kdamond && ctx == kdamond->damon_ctx &&
- (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS ||
- cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES) &&
- damon_sysfs_schemes_regions_updating) {
- damon_sysfs_schemes_update_regions_stop(ctx);
- damon_sysfs_schemes_regions_updating = false;
- mutex_unlock(&damon_sysfs_lock);
- }
if (!damon_target_has_pid(ctx))
return;
@@ -1214,57 +1198,24 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
/*
* damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files.
- * @kdamond: The kobject wrapper that associated to the kdamond thread.
+ * @data: The kobject wrapper that is associated with the kdamond thread.
*
* This function reads the schemes stats of specific kdamond and update the
* related values for sysfs files. This function should be called from DAMON
- * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON
- * contexts-internal data and DAMON sysfs variables.
+ * worker thread, to safely access the DAMON contexts-internal data. Caller
+ * should also ensure holding ``damon_sysfs_lock``, and ->damon_ctx of @data is
+ * not NULL but a valid pointer, to safely access DAMON sysfs variables.
*/
-static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond)
+static int damon_sysfs_upd_schemes_stats(void *data)
{
+ struct damon_sysfs_kdamond *kdamond = data;
struct damon_ctx *ctx = kdamond->damon_ctx;
- if (!ctx)
- return -EINVAL;
damon_sysfs_schemes_update_stats(
kdamond->contexts->contexts_arr[0]->schemes, ctx);
return 0;
}
-static int damon_sysfs_upd_schemes_regions_start(
- struct damon_sysfs_kdamond *kdamond, bool total_bytes_only)
-{
- struct damon_ctx *ctx = kdamond->damon_ctx;
-
- if (!ctx)
- return -EINVAL;
- return damon_sysfs_schemes_update_regions_start(
- kdamond->contexts->contexts_arr[0]->schemes, ctx,
- total_bytes_only);
-}
-
-static int damon_sysfs_upd_schemes_regions_stop(
- struct damon_sysfs_kdamond *kdamond)
-{
- struct damon_ctx *ctx = kdamond->damon_ctx;
-
- if (!ctx)
- return -EINVAL;
- return damon_sysfs_schemes_update_regions_stop(ctx);
-}
-
-static int damon_sysfs_clear_schemes_regions(
- struct damon_sysfs_kdamond *kdamond)
-{
- struct damon_ctx *ctx = kdamond->damon_ctx;
-
- if (!ctx)
- return -EINVAL;
- return damon_sysfs_schemes_clear_regions(
- kdamond->contexts->contexts_arr[0]->schemes, ctx);
-}
-
static inline bool damon_sysfs_kdamond_running(
struct damon_sysfs_kdamond *kdamond)
{
@@ -1318,9 +1269,9 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
return err;
}
-static int damon_sysfs_commit_schemes_quota_goals(
- struct damon_sysfs_kdamond *sysfs_kdamond)
+static int damon_sysfs_commit_schemes_quota_goals(void *data)
{
+ struct damon_sysfs_kdamond *sysfs_kdamond = data;
struct damon_ctx *ctx;
struct damon_sysfs_context *sysfs_ctx;
@@ -1338,20 +1289,18 @@ static int damon_sysfs_commit_schemes_quota_goals(
/*
* damon_sysfs_upd_schemes_effective_quotas() - Update schemes effective quotas
* sysfs files.
- * @kdamond: The kobject wrapper that associated to the kdamond thread.
+ * @data: The kobject wrapper that is associated with the kdamond thread.
*
* This function reads the schemes' effective quotas of specific kdamond and
* update the related values for sysfs files. This function should be called
* from DAMON callbacks while holding ``damon_syfs_lock``, to safely access the
* DAMON contexts-internal data and DAMON sysfs variables.
*/
-static int damon_sysfs_upd_schemes_effective_quotas(
- struct damon_sysfs_kdamond *kdamond)
+static int damon_sysfs_upd_schemes_effective_quotas(void *data)
{
+ struct damon_sysfs_kdamond *kdamond = data;
struct damon_ctx *ctx = kdamond->damon_ctx;
- if (!ctx)
- return -EINVAL;
damos_sysfs_update_effective_quotas(
kdamond->contexts->contexts_arr[0]->schemes, ctx);
return 0;
@@ -1371,67 +1320,27 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active,
bool after_aggregation)
{
struct damon_sysfs_kdamond *kdamond;
- bool total_bytes_only = false;
int err = 0;
/* avoid deadlock due to concurrent state_store('off') */
- if (!damon_sysfs_schemes_regions_updating &&
- !mutex_trylock(&damon_sysfs_lock))
+ if (!mutex_trylock(&damon_sysfs_lock))
return 0;
kdamond = damon_sysfs_cmd_request.kdamond;
if (!kdamond || kdamond->damon_ctx != c)
goto out;
switch (damon_sysfs_cmd_request.cmd) {
- case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS:
- err = damon_sysfs_upd_schemes_stats(kdamond);
- break;
case DAMON_SYSFS_CMD_COMMIT:
if (!after_aggregation)
goto out;
err = damon_sysfs_commit_input(kdamond);
break;
- case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS:
- err = damon_sysfs_commit_schemes_quota_goals(kdamond);
- break;
- case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES:
- total_bytes_only = true;
- fallthrough;
- case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS:
- if (!damon_sysfs_schemes_regions_updating) {
- err = damon_sysfs_upd_schemes_regions_start(kdamond,
- total_bytes_only);
- if (!err) {
- damon_sysfs_schemes_regions_updating = true;
- goto keep_lock_out;
- }
- } else {
- damos_sysfs_mark_finished_regions_updates(c);
- /*
- * Continue regions updating if DAMON is till
- * active and the update for all schemes is not
- * finished.
- */
- if (active && !damos_sysfs_regions_upd_done())
- goto keep_lock_out;
- err = damon_sysfs_upd_schemes_regions_stop(kdamond);
- damon_sysfs_schemes_regions_updating = false;
- }
- break;
- case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS:
- err = damon_sysfs_clear_schemes_regions(kdamond);
- break;
- case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS:
- err = damon_sysfs_upd_schemes_effective_quotas(kdamond);
- break;
default:
break;
}
/* Mark the request as invalid now. */
damon_sysfs_cmd_request.kdamond = NULL;
out:
- if (!damon_sysfs_schemes_regions_updating)
- mutex_unlock(&damon_sysfs_lock);
-keep_lock_out:
+ mutex_unlock(&damon_sysfs_lock);
return err;
}
@@ -1525,6 +1434,58 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond)
*/
}
+static int damon_sysfs_damon_call(int (*fn)(void *data),
+ struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_call_control call_control = {};
+
+ if (!kdamond->damon_ctx)
+ return -EINVAL;
+ call_control.fn = fn;
+ call_control.data = kdamond;
+ return damon_call(kdamond->damon_ctx, &call_control);
+}
+
+struct damon_sysfs_schemes_walk_data {
+ struct damon_sysfs_kdamond *sysfs_kdamond;
+ bool total_bytes_only;
+};
+
+/* populate the region directory */
+static void damon_sysfs_schemes_tried_regions_upd_one(void *data, struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *s, unsigned long sz_filter_passed)
+{
+ struct damon_sysfs_schemes_walk_data *walk_data = data;
+ struct damon_sysfs_kdamond *sysfs_kdamond = walk_data->sysfs_kdamond;
+
+ damos_sysfs_populate_region_dir(
+ sysfs_kdamond->contexts->contexts_arr[0]->schemes,
+ ctx, t, r, s, walk_data->total_bytes_only,
+ sz_filter_passed);
+}
+
+static int damon_sysfs_update_schemes_tried_regions(
+ struct damon_sysfs_kdamond *sysfs_kdamond, bool total_bytes_only)
+{
+ struct damon_sysfs_schemes_walk_data walk_data = {
+ .sysfs_kdamond = sysfs_kdamond,
+ .total_bytes_only = total_bytes_only,
+ };
+ struct damos_walk_control control = {
+ .walk_fn = damon_sysfs_schemes_tried_regions_upd_one,
+ .data = &walk_data,
+ };
+ struct damon_ctx *ctx = sysfs_kdamond->damon_ctx;
+
+ if (!ctx)
+ return -EINVAL;
+
+ damon_sysfs_schemes_clear_regions(
+ sysfs_kdamond->contexts->contexts_arr[0]->schemes);
+ return damos_walk(ctx, &control);
+}
+
/*
* damon_sysfs_handle_cmd() - Handle a command for a specific kdamond.
* @cmd: The command to handle.
@@ -1543,12 +1504,29 @@ static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd,
{
bool need_wait = true;
- /* Handle commands that doesn't access DAMON context-internal data */
switch (cmd) {
case DAMON_SYSFS_CMD_ON:
return damon_sysfs_turn_damon_on(kdamond);
case DAMON_SYSFS_CMD_OFF:
return damon_sysfs_turn_damon_off(kdamond);
+ case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS:
+ return damon_sysfs_damon_call(
+ damon_sysfs_commit_schemes_quota_goals,
+ kdamond);
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS:
+ return damon_sysfs_damon_call(
+ damon_sysfs_upd_schemes_stats, kdamond);
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES:
+ return damon_sysfs_update_schemes_tried_regions(kdamond, true);
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS:
+ return damon_sysfs_update_schemes_tried_regions(kdamond, false);
+ case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS:
+ return damon_sysfs_schemes_clear_regions(
+ kdamond->contexts->contexts_arr[0]->schemes);
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS:
+ return damon_sysfs_damon_call(
+ damon_sysfs_upd_schemes_effective_quotas,
+ kdamond);
default:
break;
}
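
The command handling above replaces the old callback-and-flag dance with damon_call() for one-shot work in the kdamond context and damos_walk() for per-region updates. A condensed sketch of the damon_call() pattern, using only the struct damon_call_control fields visible in the hunk; the wrapper name is an assumption:

static int run_in_kdamond(struct damon_ctx *ctx,
		int (*fn)(void *data), void *data)
{
	struct damon_call_control control = {
		.fn = fn,	/* executed from the kdamond worker */
		.data = data,
	};

	if (!ctx)
		return -EINVAL;
	return damon_call(ctx, &control);
}
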
diff --git a/mm/damon/tests/.kunitconfig b/mm/damon/tests/.kunitconfig
index a73be044fc9b..36a450f57b58 100644
--- a/mm/damon/tests/.kunitconfig
+++ b/mm/damon/tests/.kunitconfig
@@ -13,10 +13,3 @@ CONFIG_DAMON_VADDR_KUNIT_TEST=y
CONFIG_SYSFS=y
CONFIG_DAMON_SYSFS=y
CONFIG_DAMON_SYSFS_KUNIT_TEST=y
-
-# for DAMON debugfs interface
-CONFIG_DEBUG_FS=y
-CONFIG_DAMON_PADDR=y
-CONFIG_DAMON_DBGFS_DEPRECATED=y
-CONFIG_DAMON_DBGFS=y
-CONFIG_DAMON_DBGFS_KUNIT_TEST=y
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index cf22e09a3507..532c6a6f21f9 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -411,7 +411,7 @@ static void damos_test_new_filter(struct kunit *test)
{
struct damos_filter *filter;
- filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false);
KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON);
KUNIT_EXPECT_EQ(test, filter->matching, true);
KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list);
@@ -425,7 +425,7 @@ static void damos_test_filter_out(struct kunit *test)
struct damon_region *r, *r2;
struct damos_filter *f;
- f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true);
+ f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, false);
f->addr_range = (struct damon_addr_range){
.start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6};
@@ -434,25 +434,25 @@ static void damos_test_filter_out(struct kunit *test)
damon_add_region(r, t);
/* region in the range */
- KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 2;
- KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region after the range */
r->ar.start = DAMON_MIN_REGION * 6;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region started before the range */
r->ar.start = DAMON_MIN_REGION * 1;
r->ar.end = DAMON_MIN_REGION * 4;
- KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2);
@@ -465,7 +465,7 @@ static void damos_test_filter_out(struct kunit *test)
/* region started in the range */
r->ar.start = DAMON_MIN_REGION * 2;
r->ar.end = DAMON_MIN_REGION * 8;
- KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f));
+ KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f));
/* filter should have split the region */
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2);
KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6);
diff --git a/mm/damon/tests/dbgfs-kunit.h b/mm/damon/tests/dbgfs-kunit.h
deleted file mode 100644
index 087e53f641a8..000000000000
--- a/mm/damon/tests/dbgfs-kunit.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * DAMON Debugfs Interface Unit Tests
- *
- * Author: SeongJae Park <sj@kernel.org>
- */
-
-#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST
-
-#ifndef _DAMON_DBGFS_TEST_H
-#define _DAMON_DBGFS_TEST_H
-
-#include <kunit/test.h>
-
-static void damon_dbgfs_test_str_to_ints(struct kunit *test)
-{
- char *question;
- int *answers;
- int expected[] = {12, 35, 46};
- ssize_t nr_integers = 0, i;
-
- question = "123";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers);
- KUNIT_EXPECT_EQ(test, 123, answers[0]);
- kfree(answers);
-
- question = "123abc";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers);
- KUNIT_EXPECT_EQ(test, 123, answers[0]);
- kfree(answers);
-
- question = "a123";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
- kfree(answers);
-
- question = "12 35";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers);
- for (i = 0; i < nr_integers; i++)
- KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
- kfree(answers);
-
- question = "12 35 46";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers);
- for (i = 0; i < nr_integers; i++)
- KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
- kfree(answers);
-
- question = "12 35 abc 46";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers);
- for (i = 0; i < 2; i++)
- KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
- kfree(answers);
-
- question = "";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
- kfree(answers);
-
- question = "\n";
- answers = str_to_ints(question, strlen(question), &nr_integers);
- KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
- kfree(answers);
-}
-
-static void damon_dbgfs_test_set_targets(struct kunit *test)
-{
- struct damon_ctx *ctx = dbgfs_new_ctx();
- char buf[64];
-
- if (!damon_is_registered_ops(DAMON_OPS_PADDR)) {
- dbgfs_destroy_ctx(ctx);
- kunit_skip(test, "PADDR not registered");
- }
-
- /* Make DAMON consider target has no pid */
- damon_select_ops(ctx, DAMON_OPS_PADDR);
-
- dbgfs_set_targets(ctx, 0, NULL);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "\n");
-
- dbgfs_set_targets(ctx, 1, NULL);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n");
-
- dbgfs_set_targets(ctx, 0, NULL);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "\n");
-
- dbgfs_destroy_ctx(ctx);
-}
-
-static void damon_dbgfs_test_set_init_regions(struct kunit *test)
-{
- struct damon_ctx *ctx = damon_new_ctx();
- /* Each line represents one region in ``<target idx> <start> <end>`` */
- char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45",
- "1 10 20\n",
- "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n",
- ""};
- /* Reading the file again will show sorted, clean output */
- char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n",
- "1 10 20\n",
- "0 39 59\n0 70 134\n1 10 20\n1 20 25\n",
- ""};
- char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */
- "1 10 20\n 1 14 26\n", /* regions overlap */
- "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */
- char *input, *expect;
- int i, rc;
- char buf[256];
-
- if (!damon_is_registered_ops(DAMON_OPS_PADDR)) {
- damon_destroy_ctx(ctx);
- kunit_skip(test, "PADDR not registered");
- }
-
- damon_select_ops(ctx, DAMON_OPS_PADDR);
-
- dbgfs_set_targets(ctx, 3, NULL);
-
- /* Put valid inputs and check the results */
- for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) {
- input = valid_inputs[i];
- expect = valid_expects[i];
-
- rc = set_init_regions(ctx, input, strnlen(input, 256));
- KUNIT_EXPECT_EQ(test, rc, 0);
-
- memset(buf, 0, 256);
- sprint_init_regions(ctx, buf, 256);
-
- KUNIT_EXPECT_STREQ(test, (char *)buf, expect);
- }
- /* Put invalid inputs and check the return error code */
- for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) {
- input = invalid_inputs[i];
- pr_info("input: %s\n", input);
- rc = set_init_regions(ctx, input, strnlen(input, 256));
- KUNIT_EXPECT_EQ(test, rc, -EINVAL);
-
- memset(buf, 0, 256);
- sprint_init_regions(ctx, buf, 256);
-
- KUNIT_EXPECT_STREQ(test, (char *)buf, "");
- }
-
- dbgfs_set_targets(ctx, 0, NULL);
- damon_destroy_ctx(ctx);
-}
-
-static struct kunit_case damon_test_cases[] = {
- KUNIT_CASE(damon_dbgfs_test_str_to_ints),
- KUNIT_CASE(damon_dbgfs_test_set_targets),
- KUNIT_CASE(damon_dbgfs_test_set_init_regions),
- {},
-};
-
-static struct kunit_suite damon_test_suite = {
- .name = "damon-dbgfs",
- .test_cases = damon_test_cases,
-};
-kunit_test_suite(damon_test_suite);
-
-#endif /* _DAMON_DBGFS_TEST_H */
-
-#endif /* CONFIG_DAMON_KUNIT_TEST */
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index b9fe3bc8472b..7cd944266a92 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -68,7 +68,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test)
static struct mm_struct mm;
struct damon_addr_range regions[3] = {0};
/* 10-20-25, 200-210-220, 300-305, 307-330 */
- struct vm_area_struct vmas[] = {
+ static struct vm_area_struct vmas[] = {
(struct vm_area_struct) {.vm_start = 10, .vm_end = 20},
(struct vm_area_struct) {.vm_start = 20, .vm_end = 25},
(struct vm_area_struct) {.vm_start = 200, .vm_end = 210},
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index b9eaa20b73b9..a6174f725bd7 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -655,7 +655,7 @@ static unsigned long damos_madvise(struct damon_target *target,
static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
- struct damos *scheme)
+ struct damos *scheme, unsigned long *sz_filter_passed)
{
int madv_action;
diff --git a/mm/debug.c b/mm/debug.c
index 95b6ab809c0e..8d2acf432385 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -249,6 +249,77 @@ void dump_mm(const struct mm_struct *mm)
}
EXPORT_SYMBOL(dump_mm);
+void dump_vmg(const struct vma_merge_struct *vmg, const char *reason)
+{
+ if (reason)
+ pr_warn("vmg %px dumped because: %s\n", vmg, reason);
+
+ if (!vmg) {
+ pr_warn("vmg %px state: (NULL)\n", vmg);
+ return;
+ }
+
+ pr_warn("vmg %px state: mm %px pgoff %lx\n"
+ "vmi %px [%lx,%lx)\n"
+ "prev %px next %px vma %px\n"
+ "start %lx end %lx flags %lx\n"
+ "file %px anon_vma %px policy %px\n"
+ "uffd_ctx %px\n"
+ "anon_name %px\n"
+ "merge_flags %x state %x\n",
+ vmg, vmg->mm, vmg->pgoff,
+ vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0,
+ vmg->vmi ? vma_iter_end(vmg->vmi) : 0,
+ vmg->prev, vmg->next, vmg->vma,
+ vmg->start, vmg->end, vmg->flags,
+ vmg->file, vmg->anon_vma, vmg->policy,
+#ifdef CONFIG_USERFAULTFD
+ vmg->uffd_ctx.ctx,
+#else
+ (void *)0,
+#endif
+ vmg->anon_name,
+ (int)vmg->merge_flags, (int)vmg->state);
+
+ if (vmg->mm) {
+ pr_warn("vmg %px mm:\n", vmg);
+ dump_mm(vmg->mm);
+ } else {
+ pr_warn("vmg %px mm: (NULL)\n", vmg);
+ }
+
+ if (vmg->vma) {
+ pr_warn("vmg %px vma:\n", vmg);
+ dump_vma(vmg->vma);
+ } else {
+ pr_warn("vmg %px vma: (NULL)\n", vmg);
+ }
+
+ if (vmg->prev) {
+ pr_warn("vmg %px prev:\n", vmg);
+ dump_vma(vmg->prev);
+ } else {
+ pr_warn("vmg %px prev: (NULL)\n", vmg);
+ }
+
+ if (vmg->next) {
+ pr_warn("vmg %px next:\n", vmg);
+ dump_vma(vmg->next);
+ } else {
+ pr_warn("vmg %px next: (NULL)\n", vmg);
+ }
+
+#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
+ if (vmg->vmi) {
+ pr_warn("vmg %px vmi:\n", vmg);
+ vma_iter_dump_tree(vmg->vmi);
+ } else {
+ pr_warn("vmg %px vmi: (NULL)\n", vmg);
+ }
+#endif
+}
+EXPORT_SYMBOL(dump_vmg);
+
static bool page_init_poisoning __read_mostly = true;
static int __init setup_vm_debug(char *str)
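
dump_vmg() above prints the complete state of a struct vma_merge_struct together with its mm and the prev/next/target VMAs. A hedged example of wiring it into a sanity check; only dump_vmg() and the ->start/->end fields are taken from the hunk, the check itself is an assumption:

static void check_vmg_range(const struct vma_merge_struct *vmg)
{
	/* a merge request whose range is empty or inverted is a bug */
	if (vmg->start >= vmg->end)
		dump_vmg(vmg, "empty or inverted range");
}
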
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index ce06b2884789..ff35b84a7b50 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -245,7 +245,10 @@ early_memremap_prot(resource_size_t phys_addr, unsigned long size,
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
-void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+/*
+ * Return -ENOMEM if early_memremap() fails, e.g. when no empty slot is left.
+ */
+int __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
{
unsigned long slop, clen;
char *p;
@@ -256,12 +259,15 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
if (clen > MAX_MAP_CHUNK - slop)
clen = MAX_MAP_CHUNK - slop;
p = early_memremap(src & PAGE_MASK, clen + slop);
+ if (!p)
+ return -ENOMEM;
memcpy(dest, p + slop, clen);
early_memunmap(p, clen + slop);
dest += clen;
src += clen;
size -= clen;
}
+ return 0;
}
#else /* CONFIG_MMU */
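
Since copy_from_early_mem() can now fail with -ENOMEM when early_memremap() finds no free slot, early callers have to check its return value. A sketch of the updated calling convention; the wrapper and its warning text are assumptions:

static int __init copy_early_setup_data(void *dest, phys_addr_t src,
		unsigned long size)
{
	int err = copy_from_early_mem(dest, src, size);

	if (err)	/* -ENOMEM: no early fixmap slot was available */
		pr_warn("early copy of %lu bytes failed: %d\n", size, err);
	return err;
}
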
diff --git a/mm/execmem.c b/mm/execmem.c
index 317b6a8d35be..e6c4f5076ca8 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -257,7 +257,6 @@ out_unlock:
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
- unsigned long start, end;
struct vm_struct *vm;
size_t alloc_size;
int err = -ENOMEM;
@@ -275,26 +274,18 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
/* fill memory with instructions that will trap */
execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
- start = (unsigned long)p;
- end = start + alloc_size;
-
- vunmap_range(start, end);
-
- err = execmem_set_direct_map_valid(vm, false);
- if (err)
- goto err_free_mem;
-
- err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
- PMD_SHIFT);
+ err = set_memory_rox((unsigned long)p, vm->nr_pages);
if (err)
goto err_free_mem;
err = execmem_cache_add(p, alloc_size);
if (err)
- goto err_free_mem;
+ goto err_reset_direct_map;
return 0;
+err_reset_direct_map:
+ execmem_set_direct_map_valid(vm, true);
err_free_mem:
vfree(p);
return err;
@@ -344,6 +335,28 @@ static bool execmem_cache_free(void *ptr)
return true;
}
+
+int execmem_make_temp_rw(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+ int ret;
+
+ ret = set_memory_nx(addr, nr);
+ if (ret)
+ return ret;
+
+ return set_memory_rw(addr, nr);
+}
+
+int execmem_restore_rox(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+
+ return set_memory_rox(addr, nr);
+}
+
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
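
execmem_make_temp_rw() and execmem_restore_rox() above are meant to bracket writes into memory handed out from the ROX (read-only, executable) cache. A sketch of the intended pairing; the wrapper is an assumption and the memcpy() stands in for the real patching step:

static int patch_rox_region(void *ptr, size_t size, const void *insns)
{
	int err = execmem_make_temp_rw(ptr, size);	/* NX + RW */

	if (err)
		return err;
	memcpy(ptr, insns, size);		/* placeholder patch */
	return execmem_restore_rox(ptr, size);	/* back to read-only + exec */
}
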
diff --git a/mm/filemap.c b/mm/filemap.c
index 4f476411a9a2..e9404290f2c6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -441,6 +441,24 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
EXPORT_SYMBOL(filemap_fdatawrite_range);
/**
+ * filemap_fdatawrite_range_kick - start writeback on a range
+ * @mapping: target address_space
+ * @start: index to start writeback on
+ * @end: last (inclusive) index for writeback
+ *
+ * This is a non-integrity writeback helper, to start writing back folios
+ * for the indicated range.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
+ loff_t end)
+{
+ return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
+}
+EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
+
+/**
* filemap_flush - mostly a non-blocking flush
* @mapping: target address_space
*
@@ -1464,25 +1482,6 @@ static int folio_put_wait_locked(struct folio *folio, int state)
}
/**
- * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
- * @folio: Folio defining the wait queue of interest
- * @waiter: Waiter to add to the queue
- *
- * Add an arbitrary @waiter to the wait queue for the nominated @folio.
- */
-void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
-{
- wait_queue_head_t *q = folio_waitqueue(folio);
- unsigned long flags;
-
- spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue_entry_tail(q, waiter);
- folio_set_waiters(folio);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL_GPL(folio_add_wait_queue);
-
-/**
* folio_unlock - Unlock a locked folio.
* @folio: The folio.
*
@@ -1590,6 +1589,27 @@ int folio_wait_private_2_killable(struct folio *folio)
}
EXPORT_SYMBOL(folio_wait_private_2_killable);
+/*
+ * If folio was marked as dropbehind, then pages should be dropped when writeback
+ * completes. Do that now. If we fail, it's likely because of a big folio -
+ * just reset dropbehind for that case and later completions should invalidate.
+ */
+static void folio_end_dropbehind_write(struct folio *folio)
+{
+ /*
+ * Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
+ * but can happen if normal writeback just happens to find dirty folios
+ * that were created as part of uncached writeback, and that writeback
+ * would otherwise not need non-IRQ handling. Just skip the
+ * invalidation in that case.
+ */
+ if (in_task() && folio_trylock(folio)) {
+ if (folio->mapping)
+ folio_unmap_invalidate(folio->mapping, folio, 0);
+ folio_unlock(folio);
+ }
+}
+
/**
* folio_end_writeback - End writeback against a folio.
* @folio: The folio.
@@ -1600,6 +1620,8 @@ EXPORT_SYMBOL(folio_wait_private_2_killable);
*/
void folio_end_writeback(struct folio *folio)
{
+ bool folio_dropbehind = false;
+
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
/*
@@ -1621,9 +1643,14 @@ void folio_end_writeback(struct folio *folio)
* reused before the folio_wake_bit().
*/
folio_get(folio);
+ if (!folio_test_dirty(folio))
+ folio_dropbehind = folio_test_clear_dropbehind(folio);
if (__folio_end_writeback(folio))
folio_wake_bit(folio, PG_writeback);
acct_reclaim_writeback(folio);
+
+ if (folio_dropbehind)
+ folio_end_dropbehind_write(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
@@ -1946,6 +1973,8 @@ no_page:
/* Init accessed so avoid atomic mark_page_accessed later */
if (fgp_flags & FGP_ACCESSED)
__folio_set_referenced(folio);
+ if (fgp_flags & FGP_DONTCACHE)
+ __folio_set_dropbehind(folio);
err = filemap_add_folio(mapping, folio, index, gfp);
if (!err)
@@ -1956,8 +1985,19 @@ no_page:
if (err == -EEXIST)
goto repeat;
- if (err)
+ if (err) {
+ /*
+ * When NOWAIT I/O fails to allocate folios this could
+ * be due to a nonblocking memory allocation and not
+ * because the system actually is out of memory.
+ * Return -EAGAIN so that the caller retries in a
+ * blocking fashion instead of propagating -ENOMEM
+ * to the application.
+ */
+ if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
+ err = -EAGAIN;
return ERR_PTR(err);
+ }
/*
* filemap_add_folio locks the page, and for mmap
* we expect an unlocked page.
@@ -1968,6 +2008,9 @@ no_page:
if (!folio)
return ERR_PTR(-ENOENT);
+ /* not an uncached lookup, clear uncached if set */
+ if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
+ folio_clear_dropbehind(folio);
return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);
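
With the FGP_DONTCACHE handling added to __filemap_get_folio() above, a lookup can request a folio that is marked dropbehind and therefore evicted from the page cache once it is clean. A minimal sketch, assuming FGP_DONTCACHE combines with the existing FGP flags the way the hunk suggests; the wrapper is an assumption:

static struct folio *get_uncached_folio(struct address_space *mapping,
		pgoff_t index)
{
	/* the found (or newly created) folio carries the dropbehind hint */
	return __filemap_get_folio(mapping, index,
			FGP_LOCK | FGP_CREAT | FGP_DONTCACHE,
			mapping_gfp_mask(mapping));
}
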
@@ -2450,18 +2493,22 @@ unlock_mapping:
return error;
}
-static int filemap_create_folio(struct file *file,
- struct address_space *mapping, loff_t pos,
- struct folio_batch *fbatch)
+static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)
{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
struct folio *folio;
int error;
unsigned int min_order = mapping_min_folio_order(mapping);
pgoff_t index;
+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
+ return -EAGAIN;
+
folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
if (!folio)
return -ENOMEM;
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ __folio_set_dropbehind(folio);
/*
* Protect against truncate / hole punch. Grabbing invalidate_lock
@@ -2477,7 +2524,7 @@ static int filemap_create_folio(struct file *file,
* well to keep locking rules simple.
*/
filemap_invalidate_lock_shared(mapping);
- index = (pos >> (PAGE_SHIFT + min_order)) << min_order;
+ index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order;
error = filemap_add_folio(mapping, folio, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error == -EEXIST)
@@ -2485,7 +2532,8 @@ static int filemap_create_folio(struct file *file,
if (error)
goto error;
- error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
+ error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
+ folio);
if (error)
goto error;
@@ -2506,6 +2554,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file,
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ ractl.dropbehind = 1;
page_cache_async_ra(&ractl, folio, last_index - folio->index);
return 0;
}
@@ -2515,7 +2565,6 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count,
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
- struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index;
struct folio *folio;
@@ -2530,20 +2579,21 @@ retry:
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
if (!folio_batch_count(fbatch)) {
+ DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index);
+
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
if (iocb->ki_flags & IOCB_NOWAIT)
flags = memalloc_noio_save();
- page_cache_sync_readahead(mapping, ra, filp, index,
- last_index - index);
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ ractl.dropbehind = 1;
+ page_cache_sync_ra(&ractl, last_index - index);
if (iocb->ki_flags & IOCB_NOWAIT)
memalloc_noio_restore(flags);
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
}
if (!folio_batch_count(fbatch)) {
- if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
- return -EAGAIN;
- err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
+ err = filemap_create_folio(iocb, fbatch);
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
@@ -2584,6 +2634,20 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
return (pos1 >> shift == pos2 >> shift);
}
+static void filemap_end_dropbehind_read(struct address_space *mapping,
+ struct folio *folio)
+{
+ if (!folio_test_dropbehind(folio))
+ return;
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return;
+ if (folio_trylock(folio)) {
+ if (folio_test_clear_dropbehind(folio))
+ folio_unmap_invalidate(mapping, folio, 0);
+ folio_unlock(folio);
+ }
+}
+
/**
* filemap_read - Read data from the page cache.
* @iocb: The iocb to read.
@@ -2697,8 +2761,12 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
}
}
put_folios:
- for (i = 0; i < folio_batch_count(&fbatch); i++)
- folio_put(fbatch.folios[i]);
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ filemap_end_dropbehind_read(mapping, folio);
+ folio_put(folio);
+ }
folio_batch_init(&fbatch);
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
@@ -2839,8 +2907,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
size = min(size, folio_size(folio) - offset);
offset %= PAGE_SIZE;
- while (spliced < size &&
- !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ while (spliced < size && !pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
@@ -2897,7 +2964,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
iocb.ki_pos = *ppos;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -2957,7 +3024,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
goto out;
}
@@ -4027,17 +4094,6 @@ retry:
bytes = min(chunk - offset, bytes);
balance_dirty_pages_ratelimited(mapping);
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- */
- if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
- status = -EFAULT;
- break;
- }
-
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
@@ -4055,6 +4111,12 @@ retry:
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
+ /*
+ * Faults here on mmap()s can recurse into arbitrary
+ * filesystem code. Lots of locks are held that can
+ * deadlock. Use an atomic copy to avoid deadlocking
+ * in page fault handling.
+ */
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
flush_dcache_folio(folio);
@@ -4080,6 +4142,16 @@ retry:
bytes = copied;
goto retry;
}
+
+ /*
+ * 'folio' is now unlocked and faults on it can be
+ * handled. Ensure forward progress by trying to
+ * fault it in now.
+ */
+ if (fault_in_iov_iter_readable(i, bytes) == bytes) {
+ status = -EFAULT;
+ break;
+ }
} else {
pos += status;
written += status;
@@ -4376,6 +4448,20 @@ resched:
}
/*
+ * See mincore: reveal pagecache information only for files
+ * that the calling process has write access to, or could (if
+ * tried) open for writing.
+ */
+static inline bool can_do_cachestat(struct file *f)
+{
+ if (f->f_mode & FMODE_WRITE)
+ return true;
+ if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f)))
+ return true;
+ return file_permission(f, MAY_WRITE) == 0;
+}
+
+/*
* The cachestat(2) system call.
*
* cachestat() returns the page cache statistics of a file in the
@@ -4430,6 +4516,9 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd,
if (is_file_hugepages(fd_file(f)))
return -EOPNOTSUPP;
+ if (!can_do_cachestat(fd_file(f)))
+ return -EPERM;
+
if (flags != 0)
return -EINVAL;
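
The can_do_cachestat() check above makes cachestat(2) reveal page cache details only to callers with (potential) write access, mirroring mincore(). A user-space sketch that satisfies the new rule by opening the file O_RDWR; it assumes kernel headers recent enough to provide __NR_cachestat and struct cachestat in <linux/mman.h>:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mman.h>	/* struct cachestat_range, struct cachestat */

int main(int argc, char **argv)
{
	struct cachestat_range range = { .off = 0, .len = 0 };	/* to EOF */
	struct cachestat cs;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDWR);	/* write access is now required */
	if (fd < 0)
		return 1;
	if (syscall(__NR_cachestat, fd, &range, &cs, 0))
		return 1;
	printf("cached=%llu dirty=%llu writeback=%llu\n",
	       (unsigned long long)cs.nr_cache,
	       (unsigned long long)cs.nr_dirty,
	       (unsigned long long)cs.nr_writeback);
	close(fd);
	return 0;
}
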
diff --git a/mm/gup.c b/mm/gup.c
index 3b75e631f369..855ab860f88b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -596,6 +596,33 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs,
}
#endif /* CONFIG_HAVE_GUP_FAST */
+/* Common code for can_follow_write_* */
+static inline bool can_follow_write_common(struct page *page,
+ struct vm_area_struct *vma, unsigned int flags)
+{
+ /* Maybe FOLL_FORCE is set to override it? */
+ if (!(flags & FOLL_FORCE))
+ return false;
+
+ /* But FOLL_FORCE has no effect on shared mappings */
+ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+ return false;
+
+ /* ... or read-only private ones */
+ if (!(vma->vm_flags & VM_MAYWRITE))
+ return false;
+
+ /* ... or already writable ones that just need to take a write fault */
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+
+ /*
+ * See can_change_pte_writable(): we broke COW and could map the page
+ * writable if we have an exclusive anonymous page ...
+ */
+ return page && PageAnon(page) && PageAnonExclusive(page);
+}
+
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags, unsigned long address)
{
@@ -622,6 +649,18 @@ static struct page *no_page_table(struct vm_area_struct *vma,
}
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */
+static inline bool can_follow_write_pud(pud_t pud, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
+{
+ /* If the pud is writable, we can write to the page. */
+ if (pud_write(pud))
+ return true;
+
+ return can_follow_write_common(page, vma, flags);
+}
+
static struct page *follow_huge_pud(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp,
int flags, struct follow_page_context *ctx)
@@ -634,10 +673,11 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,
assert_spin_locked(pud_lockptr(mm, pudp));
- if ((flags & FOLL_WRITE) && !pud_write(pud))
+ if (!pud_present(pud))
return NULL;
- if (!pud_present(pud))
+ if ((flags & FOLL_WRITE) &&
+ !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags))
return NULL;
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
@@ -686,27 +726,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
if (pmd_write(pmd))
return true;
- /* Maybe FOLL_FORCE is set to override it? */
- if (!(flags & FOLL_FORCE))
- return false;
-
- /* But FOLL_FORCE has no effect on shared mappings */
- if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
- return false;
-
- /* ... or read-only private ones */
- if (!(vma->vm_flags & VM_MAYWRITE))
- return false;
-
- /* ... or already writable ones that just need to take a write fault */
- if (vma->vm_flags & VM_WRITE)
- return false;
-
- /*
- * See can_change_pte_writable(): we broke COW and could map the page
- * writable if we have an exclusive anonymous page ...
- */
- if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ if (!can_follow_write_common(page, vma, flags))
return false;
/* ... and a write-fault isn't required for other reasons. */
@@ -807,27 +827,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page,
if (pte_write(pte))
return true;
- /* Maybe FOLL_FORCE is set to override it? */
- if (!(flags & FOLL_FORCE))
- return false;
-
- /* But FOLL_FORCE has no effect on shared mappings */
- if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
- return false;
-
- /* ... or read-only private ones */
- if (!(vma->vm_flags & VM_MAYWRITE))
- return false;
-
- /* ... or already writable ones that just need to take a write fault */
- if (vma->vm_flags & VM_WRITE)
- return false;
-
- /*
- * See can_change_pte_writable(): we broke COW and could map the page
- * writable if we have an exclusive anonymous page ...
- */
- if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ if (!can_follow_write_common(page, vma, flags))
return false;
/* ... and a write-fault isn't required for other reasons. */
@@ -1294,9 +1294,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
- /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
- if (is_vm_hugetlb_page(vma))
- return -EFAULT;
/*
* We used to let the write,force case do COW in a
* VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
@@ -2257,6 +2254,7 @@ EXPORT_SYMBOL(fault_in_readable);
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
+ * @locked: a pointer to an int denoting whether the mmap lock is held
*
* Returns struct page pointer of user page pinned for dump,
* to be freed afterwards by put_page().
@@ -2269,13 +2267,12 @@ EXPORT_SYMBOL(fault_in_readable);
* Called without mmap_lock (takes and releases the mmap_lock by itself).
*/
#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
+struct page *get_dump_page(unsigned long addr, int *locked)
{
struct page *page;
- int locked = 0;
int ret;
- ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
+ ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
return (ret == 1) ? page : NULL;
}
@@ -2323,13 +2320,13 @@ static void pofs_unpin(struct pages_or_folios *pofs)
/*
* Returns the number of collected folios. Return value is always >= 0.
*/
-static unsigned long collect_longterm_unpinnable_folios(
+static void collect_longterm_unpinnable_folios(
struct list_head *movable_folio_list,
struct pages_or_folios *pofs)
{
- unsigned long i, collected = 0;
struct folio *prev_folio = NULL;
bool drain_allow = true;
+ unsigned long i;
for (i = 0; i < pofs->nr_entries; i++) {
struct folio *folio = pofs_get_folio(pofs, i);
@@ -2341,13 +2338,11 @@ static unsigned long collect_longterm_unpinnable_folios(
if (folio_is_longterm_pinnable(folio))
continue;
- collected++;
-
if (folio_is_device_coherent(folio))
continue;
if (folio_test_hugetlb(folio)) {
- isolate_hugetlb(folio, movable_folio_list);
+ folio_isolate_hugetlb(folio, movable_folio_list);
continue;
}
@@ -2364,8 +2359,6 @@ static unsigned long collect_longterm_unpinnable_folios(
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
}
-
- return collected;
}
/*
@@ -2442,11 +2435,9 @@ static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
{
LIST_HEAD(movable_folio_list);
- unsigned long collected;
- collected = collect_longterm_unpinnable_folios(&movable_folio_list,
- pofs);
- if (!collected)
+ collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
+ if (list_empty(&movable_folio_list))
return 0;
return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
@@ -3360,8 +3351,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
return 0;
if (gup_flags & FOLL_PIN) {
- seq = raw_read_seqcount(&current->mm->write_protect_seq);
- if (seq & 1)
+ if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq))
return 0;
}
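The gup_fast() change above replaces an open-coded odd-sequence check with raw_seqcount_try_begin(). A minimal sketch of the lockless read pattern it fits into; the structure and function names below are illustrative only and not taken from this patch:

#include <linux/seqlock.h>

struct guarded_value {
	seqcount_t seq;
	unsigned long value;
};

/* Lockless read: give up early if a writer is active, validate afterwards. */
static bool read_value_lockless(struct guarded_value *g, unsigned long *out)
{
	unsigned int seq;

	/* An odd count means a writer holds the sequence; try_begin bails out. */
	if (!raw_seqcount_try_begin(&g->seq, seq))
		return false;

	*out = READ_ONCE(g->value);		/* speculative read */

	/* The read is only valid if the sequence did not change meanwhile. */
	return !read_seqcount_retry(&g->seq, seq);
}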
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index db64116a4f84..373781b21e5c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -617,6 +617,8 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
+DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
+DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
@@ -637,6 +639,8 @@ static struct attribute *anon_stats_attrs[] = {
#ifndef CONFIG_SHMEM
&zswpout_attr.attr,
&swpin_attr.attr,
+ &swpin_fallback_attr.attr,
+ &swpin_fallback_charge_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -669,6 +673,8 @@ static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
&zswpout_attr.attr,
&swpin_attr.attr,
+ &swpin_fallback_attr.attr,
+ &swpin_fallback_charge_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -2003,7 +2009,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
spin_unlock(vmf->ptl);
writable = false;
- if (!migrate_misplaced_folio(folio, vma, target_nid)) {
+ if (!migrate_misplaced_folio(folio, target_nid)) {
flags |= TNF_MIGRATED;
nid = target_nid;
task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
@@ -3284,7 +3290,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
lruvec = folio_lruvec_lock(folio);
- ClearPageHasHWPoisoned(head);
+ folio_clear_has_hwpoisoned(folio);
for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
struct folio *tail;
@@ -3298,7 +3304,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
folio_account_cleaned(tail,
inode_to_wb(folio->mapping->host));
__filemap_remove_folio(tail, NULL);
- folio_put(tail);
+ folio_put_refs(tail, folio_nr_pages(tail));
} else if (!folio_test_anon(folio)) {
__xa_store(&folio->mapping->i_pages, tail->index,
tail, 0);
@@ -4175,20 +4181,21 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
if (input_buf[0] == '/') {
char *tok;
- char *buf = input_buf;
+ char *tok_buf = input_buf;
char file_path[MAX_INPUT_BUF_SZ];
pgoff_t off_start = 0, off_end = 0;
size_t input_len = strlen(input_buf);
- tok = strsep(&buf, ",");
- if (tok && buf) {
+ tok = strsep(&tok_buf, ",");
+ if (tok && tok_buf) {
strscpy(file_path, tok);
} else {
ret = -EINVAL;
goto out;
}
- ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
+ ret = sscanf(tok_buf, "0x%lx,0x%lx,%d", &off_start,
+ &off_end, &new_order);
if (ret != 2 && ret != 3) {
ret = -EINVAL;
goto out;
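For illustration of the split_huge_pages_write() parsing above: the third field (new_order) is optional, which is why both sscanf() return values are accepted. The helper below is a standalone sketch, not part of the patch:

#include <linux/errno.h>
#include <linux/kernel.h>

/* The trailing new_order field is optional, so a return of 2 or 3 is accepted. */
static int parse_offsets_example(const char *input)
{
	unsigned long off_start, off_end;
	int new_order = 0;	/* default order when the third field is omitted */
	int n;

	n = sscanf(input, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
	if (n != 2 && n != 3)
		return -EINVAL;

	return 0;
}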
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eaaec19caa7c..318624c96584 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -48,6 +48,7 @@
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"
+#include <linux/page-isolation.h>
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
@@ -1246,69 +1247,6 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
hugetlb_dup_vma_private(vma);
}
-/* Returns true if the VMA has associated reserve pages */
-static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
-{
- if (vma->vm_flags & VM_NORESERVE) {
- /*
- * This address is already reserved by other process(chg == 0),
- * so, we should decrement reserved count. Without decrementing,
- * reserve count remains after releasing inode, because this
- * allocated page will go into page cache and is regarded as
- * coming from reserved pool in releasing step. Currently, we
- * don't have any other solution to deal with this situation
- * properly, so add work-around here.
- */
- if (vma->vm_flags & VM_MAYSHARE && chg == 0)
- return true;
- else
- return false;
- }
-
- /* Shared mappings always use reserves */
- if (vma->vm_flags & VM_MAYSHARE) {
- /*
- * We know VM_NORESERVE is not set. Therefore, there SHOULD
- * be a region map for all pages. The only situation where
- * there is no region map is if a hole was punched via
- * fallocate. In this case, there really are no reserves to
- * use. This situation is indicated if chg != 0.
- */
- if (chg)
- return false;
- else
- return true;
- }
-
- /*
- * Only the process that called mmap() has reserves for
- * private mappings.
- */
- if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
- /*
- * Like the shared case above, a hole punch or truncate
- * could have been performed on the private mapping.
- * Examine the value of chg to determine if reserves
- * actually exist or were previously consumed.
- * Very Subtle - The value of chg comes from a previous
- * call to vma_needs_reserves(). The reserve map for
- * private mappings has different (opposite) semantics
- * than that of shared mappings. vma_needs_reserves()
- * has already taken this difference in semantics into
- * account. Therefore, the meaning of chg is the same
- * as in the shared case above. Code could easily be
- * combined, but keeping it separate draws attention to
- * subtle differences.
- */
- if (chg)
- return false;
- else
- return true;
- }
-
- return false;
-}
-
static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
{
int nid = folio_nid(folio);
@@ -1336,6 +1274,9 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
if (folio_test_hwpoison(folio))
continue;
+ if (is_migrate_isolate_page(&folio->page))
+ continue;
+
list_move(&folio->lru, &h->hugepage_activelist);
folio_ref_unfreeze(folio, 1);
folio_clear_hugetlb_freed(folio);
@@ -1394,8 +1335,7 @@ static unsigned long available_huge_pages(struct hstate *h)
static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
struct vm_area_struct *vma,
- unsigned long address, int avoid_reserve,
- long chg)
+ unsigned long address, long gbl_chg)
{
struct folio *folio = NULL;
struct mempolicy *mpol;
@@ -1404,15 +1344,10 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
int nid;
/*
- * A child process with MAP_PRIVATE mappings created by their parent
- * have no page reserves. This check ensures that reservations are
- * not "stolen". The child may still get SIGKILLed
+ * gbl_chg==1 means the allocation requires a new page that was not
+ * reserved before. Make sure there's at least one free page.
*/
- if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
- goto err;
-
- /* If reserves cannot be used, ensure enough pages are in the pool */
- if (avoid_reserve && !available_huge_pages(h))
+ if (gbl_chg && !available_huge_pages(h))
goto err;
gfp_mask = htlb_alloc_mask(h);
@@ -1430,11 +1365,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
nid, nodemask);
- if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) {
- folio_set_hugetlb_restore_reserve(folio);
- h->resv_huge_pages--;
- }
-
mpol_cond_put(mpol);
return folio;
@@ -2205,6 +2135,8 @@ retry:
if (!folio_ref_count(folio)) {
struct hstate *h = folio_hstate(folio);
+ bool adjust_surplus = false;
+
if (!available_huge_pages(h))
goto out;
@@ -2227,7 +2159,9 @@ retry:
goto retry;
}
- remove_hugetlb_folio(h, folio, false);
+ if (h->surplus_huge_pages_node[folio_nid(folio)])
+ adjust_surplus = true;
+ remove_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
@@ -2247,7 +2181,7 @@ retry:
rc = hugetlb_vmemmap_restore_folio(h, folio);
if (rc) {
spin_lock_irq(&hugetlb_lock);
- add_hugetlb_folio(h, folio, false);
+ add_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages++;
goto out;
}
@@ -2463,7 +2397,13 @@ static int gather_surplus_pages(struct hstate *h, long delta)
long needed, allocated;
bool alloc_ok = true;
int node;
- nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
+ nodemask_t *mbind_nodemask, alloc_nodemask;
+
+ mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
+ if (mbind_nodemask)
+ nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed);
+ else
+ alloc_nodemask = cpuset_current_mems_allowed;
lockdep_assert_held(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
@@ -2479,8 +2419,16 @@ retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
folio = NULL;
- for_each_node_mask(node, cpuset_current_mems_allowed) {
- if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) {
+
+ /* Prioritize current node */
+ if (node_isset(numa_mem_id(), alloc_nodemask))
+ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
+ numa_mem_id(), NULL);
+
+ if (!folio) {
+ for_each_node_mask(node, alloc_nodemask) {
+ if (node == numa_mem_id())
+ continue;
folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
node, NULL);
if (folio)
@@ -2868,7 +2816,7 @@ retry:
* Fail with -EBUSY if not possible.
*/
spin_unlock_irq(&hugetlb_lock);
- isolated = isolate_hugetlb(old_folio, list);
+ isolated = folio_isolate_hugetlb(old_folio, list);
ret = isolated ? 0 : -EBUSY;
spin_lock_irq(&hugetlb_lock);
goto free_new;
@@ -2953,7 +2901,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
if (hstate_is_gigantic(h))
return -ENOMEM;
- if (folio_ref_count(folio) && isolate_hugetlb(folio, list))
+ if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
ret = 0;
else if (!folio_ref_count(folio))
ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
@@ -2961,69 +2909,137 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
return ret;
}
+/*
+ * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn
+ * range with new folios.
+ * @start_pfn: start pfn of the given pfn range
+ * @end_pfn: end pfn of the given pfn range
+ * Returns 0 on success, otherwise a negated error code.
+ */
+int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct hstate *h;
+ struct folio *folio;
+ int ret = 0;
+
+ LIST_HEAD(isolate_list);
+
+ while (start_pfn < end_pfn) {
+ folio = pfn_folio(start_pfn);
+ if (folio_test_hugetlb(folio)) {
+ h = folio_hstate(folio);
+ } else {
+ start_pfn++;
+ continue;
+ }
+
+ if (!folio_ref_count(folio)) {
+ ret = alloc_and_dissolve_hugetlb_folio(h, folio,
+ &isolate_list);
+ if (ret)
+ break;
+
+ putback_movable_pages(&isolate_list);
+ }
+ start_pfn++;
+ }
+
+ return ret;
+}
+
+void wait_for_freed_hugetlb_folios(void)
+{
+ if (llist_empty(&hpage_freelist))
+ return;
+
+ flush_work(&free_hpage_work);
+}
+
+typedef enum {
+ /*
+ * For either 0/1: we checked the per-vma resv map; one resv
+ * count can either be reused (0) or an extra one is needed (1).
+ */
+ MAP_CHG_REUSE = 0,
+ MAP_CHG_NEEDED = 1,
+ /*
+ * The per-vma resv count cannot be used, hence a new resv
+ * count is enforced.
+ *
+ * NOTE: This is mostly identical to MAP_CHG_NEEDED, except
+ * that currently vma_needs_reservation() has an unwanted side
+ * effect of using either end() or commit() to complete the
+ * transaction. Hence it needs to be differentiated from NEEDED.
+ */
+ MAP_CHG_ENFORCED = 2,
+} map_chg_state;
+
+/*
+ * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
+ * faults of hugetlb private mappings on top of a non-page-cache folio (in
+ * which case even if there's a private vma resv map it won't cover such
+ * allocation). New call sites should (probably) never set it to true!!
+ * When it's set, the allocation will bypass all vma level reservations.
+ */
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve)
+ unsigned long addr, bool cow_from_owner)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct folio *folio;
- long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
- long gbl_chg;
- int memcg_charge_ret, ret, idx;
+ long retval, gbl_chg;
+ map_chg_state map_chg;
+ int ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
- struct mem_cgroup *memcg;
- bool deferred_reserve;
gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
- memcg = get_mem_cgroup_from_current();
- memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
- if (memcg_charge_ret == -ENOMEM) {
- mem_cgroup_put(memcg);
- return ERR_PTR(-ENOMEM);
- }
-
idx = hstate_index(h);
- /*
- * Examine the region/reserve map to determine if the process
- * has a reservation for the page to be allocated. A return
- * code of zero indicates a reservation exists (no change).
- */
- map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
- if (map_chg < 0) {
- if (!memcg_charge_ret)
- mem_cgroup_cancel_charge(memcg, nr_pages);
- mem_cgroup_put(memcg);
- return ERR_PTR(-ENOMEM);
+
+ /* Do we need a separate per-vma reservation? */
+ if (cow_from_owner) {
+ /*
+ * Special case! Since it's a CoW on top of a reserved
+ * page, the private resv map doesn't count. So it cannot
+ * consume the per-vma resv map even if it's reserved.
+ */
+ map_chg = MAP_CHG_ENFORCED;
+ } else {
+ /*
+ * Examine the region/reserve map to determine if the process
+ * has a reservation for the page to be allocated. A return
+ * code of zero indicates a reservation exists (no change).
+ */
+ retval = vma_needs_reservation(h, vma, addr);
+ if (retval < 0)
+ return ERR_PTR(-ENOMEM);
+ map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
}
/*
+ * Do we need a separate global reservation?
+ *
* Processes that did not create the mapping will have no
* reserves as indicated by the region/reserve map. Check
* that the allocation will not exceed the subpool limit.
- * Allocations for MAP_NORESERVE mappings also need to be
- * checked against any subpool limit.
+ * Or if it can get one from the pool reservation directly.
*/
- if (map_chg || avoid_reserve) {
+ if (map_chg) {
gbl_chg = hugepage_subpool_get_pages(spool, 1);
if (gbl_chg < 0)
goto out_end_reservation;
-
+ } else {
/*
- * Even though there was no reservation in the region/reserve
- * map, there could be reservations associated with the
- * subpool that can be used. This would be indicated if the
- * return value of hugepage_subpool_get_pages() is zero.
- * However, if avoid_reserve is specified we still avoid even
- * the subpool reservations.
+ * If we have the vma reservation ready, no need for extra
+ * global reservation.
*/
- if (avoid_reserve)
- gbl_chg = 1;
+ gbl_chg = 0;
}
- /* If this allocation is not consuming a reservation, charge it now.
+ /*
+ * If this allocation is not consuming a per-vma reservation,
+ * charge the hugetlb cgroup now.
*/
- deferred_reserve = map_chg || avoid_reserve;
- if (deferred_reserve) {
+ if (map_chg) {
ret = hugetlb_cgroup_charge_cgroup_rsvd(
idx, pages_per_huge_page(h), &h_cg);
if (ret)
@@ -3040,27 +3056,32 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
* from the global free pool (global change). gbl_chg == 0 indicates
* a reservation exists for the allocation.
*/
- folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg);
+ folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg);
if (!folio) {
spin_unlock_irq(&hugetlb_lock);
folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
if (!folio)
goto out_uncharge_cgroup;
spin_lock_irq(&hugetlb_lock);
- if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
- folio_set_hugetlb_restore_reserve(folio);
- h->resv_huge_pages--;
- }
list_add(&folio->lru, &h->hugepage_activelist);
folio_ref_unfreeze(folio, 1);
/* Fall through */
}
+ /*
+ * Whether dequeued or buddy-allocated, the folio needs a special
+ * mark when it consumes a global reservation.
+ */
+ if (!gbl_chg) {
+ folio_set_hugetlb_restore_reserve(folio);
+ h->resv_huge_pages--;
+ }
+
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
/* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
*/
- if (deferred_reserve) {
+ if (map_chg) {
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
h_cg, folio);
}
@@ -3069,50 +3090,61 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
hugetlb_set_folio_subpool(folio, spool);
- map_commit = vma_commit_reservation(h, vma, addr);
- if (unlikely(map_chg > map_commit)) {
+ if (map_chg != MAP_CHG_ENFORCED) {
+ /* commit() is only needed if the map_chg is not enforced */
+ retval = vma_commit_reservation(h, vma, addr);
/*
+ * Check for a possible race condition. When it happens:
* The page was added to the reservation map between
* vma_needs_reservation and vma_commit_reservation.
* This indicates a race with hugetlb_reserve_pages.
* Adjust for the subpool count incremented above AND
- * in hugetlb_reserve_pages for the same page. Also,
+ * in hugetlb_reserve_pages for the same page. Also,
* the reservation count added in hugetlb_reserve_pages
* no longer applies.
*/
- long rsv_adjust;
+ if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) {
+ long rsv_adjust;
- rsv_adjust = hugepage_subpool_put_pages(spool, 1);
- hugetlb_acct_memory(h, -rsv_adjust);
- if (deferred_reserve) {
- spin_lock_irq(&hugetlb_lock);
- hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
- pages_per_huge_page(h), folio);
- spin_unlock_irq(&hugetlb_lock);
+ rsv_adjust = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -rsv_adjust);
+ if (map_chg) {
+ spin_lock_irq(&hugetlb_lock);
+ hugetlb_cgroup_uncharge_folio_rsvd(
+ hstate_index(h), pages_per_huge_page(h),
+ folio);
+ spin_unlock_irq(&hugetlb_lock);
+ }
}
}
- if (!memcg_charge_ret)
- mem_cgroup_commit_charge(folio, memcg);
+ ret = mem_cgroup_charge_hugetlb(folio, gfp);
+ /*
+ * Unconditionally increment NR_HUGETLB here. If it turns out that
+ * mem_cgroup_charge_hugetlb failed, then immediately free the page and
+ * decrement NR_HUGETLB.
+ */
lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));
- mem_cgroup_put(memcg);
+
+ if (ret == -ENOMEM) {
+ free_huge_folio(folio);
+ return ERR_PTR(-ENOMEM);
+ }
return folio;
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_uncharge_cgroup_reservation:
- if (deferred_reserve)
+ if (map_chg)
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
h_cg);
out_subpool_put:
- if (map_chg || avoid_reserve)
+ if (map_chg)
hugepage_subpool_put_pages(spool, 1);
out_end_reservation:
- vma_end_reservation(h, vma, addr);
- if (!memcg_charge_ret)
- mem_cgroup_cancel_charge(memcg, nr_pages);
- mem_cgroup_put(memcg);
+ if (map_chg != MAP_CHG_ENFORCED)
+ vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
@@ -3125,7 +3157,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
/* do node specific alloc */
if (nid != NUMA_NO_NODE) {
- m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
+ m = memblock_alloc_exact_nid_raw(huge_page_size(h), huge_page_size(h),
0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
if (!m)
return 0;
@@ -3289,7 +3321,7 @@ static void __init gather_bootmem_prealloc(void)
.thread_fn = gather_bootmem_prealloc_parallel,
.fn_arg = NULL,
.start = 0,
- .size = num_node_state(N_MEMORY),
+ .size = nr_node_ids,
.align = 1,
.min_chunk = 1,
.max_threads = num_node_state(N_MEMORY),
@@ -3806,13 +3838,15 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
struct page *page = folio_page(folio, i);
+ /* Careful: see __split_huge_page_tail() */
+ struct folio *new_folio = (struct folio *)page;
- page->mapping = NULL;
clear_compound_head(page);
prep_compound_page(page, dst->order);
- init_new_hugetlb_folio(dst, page_folio(page));
- list_add(&page->lru, &dst_list);
+ new_folio->mapping = NULL;
+ init_new_hugetlb_folio(dst, new_folio);
+ list_add(&new_folio->lru, &dst_list);
}
}
@@ -4845,7 +4879,7 @@ out:
return ret;
}
-static struct ctl_table hugetlb_table[] = {
+static const struct ctl_table hugetlb_table[] = {
{
.procname = "nr_hugepages",
.data = NULL,
@@ -5141,12 +5175,12 @@ const struct vm_operations_struct hugetlb_vm_ops = {
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
- int writable)
+ bool try_mkwrite)
{
pte_t entry;
unsigned int shift = huge_page_shift(hstate_vma(vma));
- if (writable) {
+ if (try_mkwrite && (vma->vm_flags & VM_WRITE)) {
entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
vma->vm_page_prot)));
} else {
@@ -5169,6 +5203,13 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
update_mmu_cache(vma, address, ptep);
}
+static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ if (vma->vm_flags & VM_WRITE)
+ set_huge_ptep_writable(vma, address, ptep);
+}
+
bool is_hugetlb_entry_migration(pte_t pte)
{
swp_entry_t swp;
@@ -5199,7 +5240,7 @@ static void
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
struct folio *new_folio, pte_t old, unsigned long sz)
{
- pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
+ pte_t newpte = make_huge_pte(vma, &new_folio->page, true);
__folio_mark_uptodate(new_folio);
hugetlb_add_new_anon_rmap(new_folio, vma, addr);
@@ -5333,7 +5374,7 @@ again:
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
- new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
+ new_folio = alloc_hugetlb_folio(dst_vma, addr, false);
if (IS_ERR(new_folio)) {
folio_put(pte_folio);
ret = PTR_ERR(new_folio);
@@ -5418,7 +5459,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
if (src_ptl != dst_ptl)
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
- pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
+ pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);
if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
huge_pte_clear(mm, new_addr, dst_pte, sz);
@@ -5593,7 +5634,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
}
- pte = huge_ptep_get_and_clear(mm, address, ptep);
+ pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
@@ -5799,7 +5840,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
struct hstate *h = hstate_vma(vma);
struct folio *old_folio;
struct folio *new_folio;
- int outside_reserve = 0;
+ bool cow_from_owner = false;
vm_fault_t ret = 0;
struct mmu_notifier_range range;
@@ -5814,13 +5855,6 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
if (!unshare && huge_pte_uffd_wp(pte))
return 0;
- /*
- * hugetlb does not support FOLL_FORCE-style write faults that keep the
- * PTE mapped R/O such as maybe_mkwrite() would do.
- */
- if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
- return VM_FAULT_SIGSEGV;
-
/* Let's take out MAP_SHARED mappings first. */
if (vma->vm_flags & VM_MAYSHARE) {
set_huge_ptep_writable(vma, vmf->address, vmf->pte);
@@ -5849,7 +5883,8 @@ retry_avoidcopy:
SetPageAnonExclusive(&old_folio->page);
}
if (likely(!unshare))
- set_huge_ptep_writable(vma, vmf->address, vmf->pte);
+ set_huge_ptep_maybe_writable(vma, vmf->address,
+ vmf->pte);
delayacct_wpcopy_end();
return 0;
@@ -5868,7 +5903,7 @@ retry_avoidcopy:
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
old_folio != pagecache_folio)
- outside_reserve = 1;
+ cow_from_owner = true;
folio_get(old_folio);
@@ -5877,7 +5912,7 @@ retry_avoidcopy:
* be acquired again before returning to the caller, as expected.
*/
spin_unlock(vmf->ptl);
- new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve);
+ new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner);
if (IS_ERR(new_folio)) {
/*
@@ -5887,7 +5922,7 @@ retry_avoidcopy:
* reliability, unmap the page from child processes. The child
* may get SIGKILLed if it later faults.
*/
- if (outside_reserve) {
+ if (cow_from_owner) {
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx;
u32 hash;
@@ -6138,7 +6173,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
goto out;
}
- folio = alloc_hugetlb_folio(vma, vmf->address, 0);
+ folio = alloc_hugetlb_folio(vma, vmf->address, false);
if (IS_ERR(folio)) {
/*
* Returning error will result in faulting task being
@@ -6235,8 +6270,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
else
hugetlb_add_file_rmap(folio);
- new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
- && (vma->vm_flags & VM_SHARED)));
+ new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED);
/*
* If this pte was previously wr-protected, keep it wr-protected even
* if populated.
@@ -6568,7 +6602,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
spinlock_t *ptl;
int ret = -ENOMEM;
struct folio *folio;
- int writable;
bool folio_in_pagecache = false;
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
@@ -6606,7 +6639,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
goto out;
}
- folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
if (IS_ERR(folio)) {
ret = -ENOMEM;
goto out;
@@ -6648,7 +6681,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
goto out;
}
- folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
if (IS_ERR(folio)) {
folio_put(*foliop);
ret = -ENOMEM;
@@ -6722,12 +6755,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
* with wp flag set, don't set pte write bit.
*/
- if (wp_enabled || (is_continue && !vm_shared))
- writable = 0;
- else
- writable = dst_vma->vm_flags & VM_WRITE;
-
- _dst_pte = make_huge_pte(dst_vma, &folio->page, writable);
+ _dst_pte = make_huge_pte(dst_vma, &folio->page,
+ !wp_enabled && !(is_continue && !vm_shared));
/*
* Always mark UFFDIO_COPY page dirty; note that this may not be
* extremely important for hugetlbfs for now since swapping is not
@@ -7406,7 +7435,24 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
-bool isolate_hugetlb(struct folio *folio, struct list_head *list)
+/**
+ * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio
+ * @folio: the folio to isolate
+ * @list: the list to add the folio to on success
+ *
+ * Isolate an allocated (refcount > 0) hugetlb folio, marking it as
+ * isolated/non-migratable, and moving it from the active list to the
+ * given list.
+ *
+ * Isolation will fail if @folio is not an allocated hugetlb folio, or if
+ * it is already isolated/non-migratable.
+ *
+ * On success, an additional folio reference is taken that must be dropped
+ * using folio_putback_hugetlb() to undo the isolation.
+ *
+ * Return: True if isolation worked, otherwise False.
+ */
+bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list)
{
bool ret = true;
@@ -7454,7 +7500,18 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
return ret;
}
-void folio_putback_active_hugetlb(struct folio *folio)
+/**
+ * folio_putback_hugetlb - unisolate a hugetlb folio
+ * @folio: the isolated hugetlb folio
+ *
+ * Putback/un-isolate the hugetlb folio that was previously isolated using
+ * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it
+ * back onto the active list.
+ *
+ * Will drop the additional folio reference obtained through
+ * folio_isolate_hugetlb().
+ */
+void folio_putback_hugetlb(struct folio *folio)
{
spin_lock_irq(&hugetlb_lock);
folio_set_hugetlb_migratable(folio);
@@ -7501,6 +7558,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
}
spin_unlock_irq(&hugetlb_lock);
}
+
+ /*
+ * Our old folio is isolated and has "migratable" cleared until it
+ * is put back. As migration succeeded, set the new folio "migratable"
+ * and add it to the active list.
+ */
+ spin_lock_irq(&hugetlb_lock);
+ folio_set_hugetlb_migratable(new_folio);
+ list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist);
+ spin_unlock_irq(&hugetlb_lock);
}
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
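A minimal usage sketch of the folio_isolate_hugetlb()/folio_putback_hugetlb() pairing documented above, assuming the caller already holds a reference on a hugetlb folio; the helper name is made up for the example:

#include <linux/hugetlb.h>
#include <linux/list.h>

static void isolate_then_putback_example(struct folio *folio)
{
	LIST_HEAD(isolated);

	/* Fails if this is not an allocated hugetlb folio or it is already isolated. */
	if (!folio_isolate_hugetlb(folio, &isolated))
		return;

	/* ... the folio is now off the active list and cannot be migrated ... */

	/* Re-mark it migratable and drop the reference taken by the isolation. */
	folio_putback_hugetlb(folio);
}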
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index e716c4671a15..bb9578bd99f9 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -195,24 +195,23 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
* cannot fail.
*/
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
- struct page *page)
+ struct folio *folio)
{
unsigned int nr_pages;
struct page_counter *counter;
- struct hugetlb_cgroup *page_hcg;
+ struct hugetlb_cgroup *hcg;
struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
- struct folio *folio = page_folio(page);
- page_hcg = hugetlb_cgroup_from_folio(folio);
+ hcg = hugetlb_cgroup_from_folio(folio);
/*
* We can have pages in active list without any cgroup
* ie, hugepage with less than 3 pages. We can safely
* ignore those pages.
*/
- if (!page_hcg || page_hcg != h_cg)
+ if (!hcg || hcg != h_cg)
goto out;
- nr_pages = compound_nr(page);
+ nr_pages = folio_nr_pages(folio);
if (!parent) {
parent = root_h_cgroup;
/* root has no limit */
@@ -235,13 +234,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
struct hstate *h;
- struct page *page;
+ struct folio *folio;
do {
for_each_hstate(h) {
spin_lock_irq(&hugetlb_lock);
- list_for_each_entry(page, &h->hugepage_activelist, lru)
- hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page);
+ list_for_each_entry(folio, &h->hugepage_activelist, lru)
+ hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio);
spin_unlock_irq(&hugetlb_lock);
}
@@ -917,7 +916,6 @@ void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
list_move(&new_folio->lru, &h->hugepage_activelist);
spin_unlock_irq(&hugetlb_lock);
- return;
}
static struct cftype hugetlb_files[] = {
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 57b7f591eee8..7735972add01 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -693,7 +693,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
free_vmemmap_page_list(&vmemmap_pages);
}
-static struct ctl_table hugetlb_vmemmap_sysctls[] = {
+static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
{
.procname = "hugetlb_optimize_vmemmap",
.data = &vmemmap_optimize_enabled,
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 24c809379274..6af3ad675930 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -40,7 +40,7 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
#ifdef CONFIG_PER_VMA_LOCK
- .mm_lock_seq = 0,
+ .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
#endif
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
diff --git a/mm/internal.h b/mm/internal.h
index 9826f7dce607..20b3535935a3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -392,6 +392,8 @@ void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details);
+int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
+ gfp_t gfp);
void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
unsigned int order);
@@ -735,15 +737,30 @@ static inline void prep_compound_tail(struct page *head, int tail_idx)
extern void prep_compound_page(struct page *page, unsigned int order);
-extern void post_alloc_hook(struct page *page, unsigned int order,
- gfp_t gfp_flags);
+void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags);
extern bool free_pages_prepare(struct page *page, unsigned int order);
extern int user_min_free_kbytes;
-void free_unref_page(struct page *page, unsigned int order);
+struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid,
+ nodemask_t *);
+#define __alloc_frozen_pages(...) \
+ alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__))
+void free_frozen_pages(struct page *page, unsigned int order);
void free_unref_folios(struct folio_batch *fbatch);
+#ifdef CONFIG_NUMA
+struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order);
+#else
+static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order)
+{
+ return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL);
+}
+#endif
+
+#define alloc_frozen_pages(...) \
+ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__))
+
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
@@ -824,10 +841,6 @@ int
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
-int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end,
- int migratetype);
-
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void init_cma_reserved_pageblock(struct page *page);
@@ -1102,7 +1115,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
* mm/memory-failure.c
*/
#ifdef CONFIG_MEMORY_FAILURE
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu);
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);
@@ -1125,8 +1138,9 @@ unsigned long page_mapped_in_vma(const struct page *page,
struct vm_area_struct *vma);
#else
-static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
+ return -EBUSY;
}
#endif
@@ -1440,22 +1454,6 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
int priority);
-#ifdef CONFIG_64BIT
-static inline int can_do_mseal(unsigned long flags)
-{
- if (flags)
- return -EINVAL;
-
- return 0;
-}
-
-#else
-static inline int can_do_mseal(unsigned long flags)
-{
- return -EPERM;
-}
-#endif
-
#ifdef CONFIG_SHRINKER_DEBUG
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
struct shrinker *shrinker, const char *fmt, va_list ap)
@@ -1530,4 +1528,23 @@ int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private);
+/* pt_reclaim.c */
+bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
+void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
+ pmd_t pmdval);
+void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+ struct mmu_gather *tlb);
+
+#ifdef CONFIG_PT_RECLAIM
+bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+ struct zap_details *details);
+#else
+static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+ struct zap_details *details)
+{
+ return false;
+}
+#endif /* CONFIG_PT_RECLAIM */
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 8b9e348113b1..d54e89f8c3e7 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -524,7 +524,11 @@ size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object)
sizeof(struct kasan_free_meta) : 0);
}
-static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
+/*
+ * This function avoids dynamic memory allocations and thus can be called from
+ * contexts that do not allow allocating memory.
+ */
+void kasan_record_aux_stack(void *addr)
{
struct slab *slab = kasan_addr_to_slab(addr);
struct kmem_cache *cache;
@@ -541,17 +545,7 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
return;
alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
- alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags);
-}
-
-void kasan_record_aux_stack(void *addr)
-{
- return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC);
-}
-
-void kasan_record_aux_stack_noalloc(void *addr)
-{
- return __kasan_record_aux_stack(addr, 0);
+ alloc_meta->aux_stack[0] = kasan_save_stack(0, 0);
}
void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index ccd66c7a4081..9a6927394b54 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -16,6 +16,7 @@
#include <linux/mm.h>
#include <linux/static_key.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
@@ -263,8 +264,8 @@ void __init kasan_init_hw_tags(void)
pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n",
kasan_mode_info(),
- kasan_vmalloc_enabled() ? "on" : "off",
- kasan_stack_collection_enabled() ? "on" : "off");
+ str_on_off(kasan_vmalloc_enabled()),
+ str_on_off(kasan_stack_collection_enabled()));
}
#ifdef CONFIG_KASAN_VMALLOC
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index b7e4b81421b3..129178be5e64 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -501,18 +501,18 @@ static inline bool kasan_byte_accessible(const void *addr)
/**
* kasan_poison - mark the memory range as inaccessible
- * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
- * @size - range size, must be aligned to KASAN_GRANULE_SIZE
- * @value - value that's written to metadata for the range
- * @init - whether to initialize the memory range (only for hardware tag-based)
+ * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size: range size, must be aligned to KASAN_GRANULE_SIZE
+ * @value: value that's written to metadata for the range
+ * @init: whether to initialize the memory range (only for hardware tag-based)
*/
void kasan_poison(const void *addr, size_t size, u8 value, bool init);
/**
* kasan_unpoison - mark the memory range as accessible
- * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
- * @size - range size, can be unaligned
- * @init - whether to initialize the memory range (only for hardware tag-based)
+ * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size: range size, can be unaligned
+ * @init: whether to initialize the memory range (only for hardware tag-based)
*
* For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before
* marking the range.
@@ -530,8 +530,8 @@ bool kasan_byte_accessible(const void *addr);
/**
* kasan_poison_last_granule - mark the last granule of the memory range as
* inaccessible
- * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
- * @size - range size
+ * @address: range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size: range size
*
* This function is only available for the generic mode, as it's the only mode
* that has partially poisoned memory granules.
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index 99d4ff0ed57a..59d673400085 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -47,8 +47,8 @@ static struct {
* Some tests use these global variables to store return values from function
* calls that could otherwise be eliminated by the compiler as dead code.
*/
-void *kasan_ptr_result;
-int kasan_int_result;
+static volatile void *kasan_ptr_result;
+static volatile int kasan_int_result;
/* Probe for console output: obtains test_status lines of interest. */
static void probe_console(void *ignore, const char *buf, size_t len)
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 3fe77a360f1c..8357e1a33699 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -370,6 +370,36 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
+/*
+ * This function is invoked with report_lock (a raw_spinlock) held. A
+ * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping
+ * rt_spinlock.
+ *
+ * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a
+ * lockdep warning for this raw_spinlock -> spinlock dependency. This config
+ * option is enabled by default to ensure better test coverage to expose this
+ * kind of RT kernel problem. This lockdep splat, however, can be suppressed
+ * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the
+ * invalid PREEMPT_RT case has been taken care of.
+ */
+static inline struct vm_struct *kasan_find_vm_area(void *addr)
+{
+ static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP);
+ struct vm_struct *va;
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return NULL;
+
+ /*
+ * Suppress lockdep warning and fetch vmalloc area of the
+ * offending address.
+ */
+ lock_map_acquire_try(&vmalloc_map);
+ va = find_vm_area(addr);
+ lock_map_release(&vmalloc_map);
+ return va;
+}
+
static void print_address_description(void *addr, u8 tag,
struct kasan_report_info *info)
{
@@ -399,7 +429,7 @@ static void print_address_description(void *addr, u8 tag,
}
if (is_vmalloc_addr(addr)) {
- struct vm_struct *va = find_vm_area(addr);
+ struct vm_struct *va = kasan_find_vm_area(addr);
if (va) {
pr_err("The buggy address belongs to the virtual mapping at\n"
@@ -409,6 +439,8 @@ static void print_address_description(void *addr, u8 tag,
pr_err("\n");
page = vmalloc_to_page(addr);
+ } else {
+ pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr);
}
}
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 220b5d4c6876..b9382b5b6a37 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
#include <linux/bug.h>
@@ -45,7 +46,7 @@ void __init kasan_init_sw_tags(void)
kasan_init_tags();
pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n",
- kasan_stack_collection_enabled() ? "on" : "off");
+ str_on_off(kasan_stack_collection_enabled()));
}
/*
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 67fc321db79b..102048821c22 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -21,6 +21,7 @@
#include <linux/log2.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
+#include <linux/nodemask.h>
#include <linux/notifier.h>
#include <linux/panic_notifier.h>
#include <linux/random.h>
@@ -1084,6 +1085,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
* properties (e.g. reside in DMAable memory).
*/
if ((flags & GFP_ZONEMASK) ||
+ ((flags & __GFP_THISNODE) && num_online_nodes() > 1) ||
(s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
return NULL;
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index f65fb182466d..00034e37bc9f 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/tracepoint.h>
#include <trace/events/printk.h>
@@ -88,7 +89,7 @@ struct expect_report {
static const char *get_access_type(const struct expect_report *r)
{
- return r->is_write ? "write" : "read";
+ return str_write_read(r->is_write);
}
/* Check observed report matches information in @r. */
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index 6370c5207d1a..10e6802a2edf 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -16,6 +16,7 @@
#include <linux/sprintf.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/sched/clock.h>
#include <trace/events/error_report.h>
@@ -184,7 +185,7 @@ static void print_diff_canary(unsigned long address, size_t bytes_to_show,
static const char *get_access_type(bool is_write)
{
- return is_write ? "write" : "read";
+ return str_write_read(is_write);
}
void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index bad1e130eda8..5f0be134141e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -948,17 +948,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED;
}
-static int find_pmd_or_thp_or_none(struct mm_struct *mm,
- unsigned long address,
- pmd_t **pmd)
+static inline int check_pmd_state(pmd_t *pmd)
{
- pmd_t pmde;
+ pmd_t pmde = pmdp_get_lockless(pmd);
- *pmd = mm_find_pmd(mm, address);
- if (!*pmd)
- return SCAN_PMD_NULL;
-
- pmde = pmdp_get_lockless(*pmd);
if (pmd_none(pmde))
return SCAN_PMD_NONE;
if (!pmd_present(pmde))
@@ -972,6 +965,17 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
return SCAN_SUCCEED;
}
+static int find_pmd_or_thp_or_none(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t **pmd)
+{
+ *pmd = mm_find_pmd(mm, address);
+ if (!*pmd)
+ return SCAN_PMD_NULL;
+
+ return check_pmd_state(*pmd);
+}
+
static int check_pmd_still_valid(struct mm_struct *mm,
unsigned long address,
pmd_t *pmd)
@@ -1721,7 +1725,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
pmd_t *pmd, pgt_pmd;
spinlock_t *pml;
spinlock_t *ptl;
- bool skipped_uffd = false;
+ bool success = false;
/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
@@ -1758,6 +1762,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
mmu_notifier_invalidate_range_start(&range);
pml = pmd_lock(mm, pmd);
+ /*
+ * The lock of new_folio is still held, so we will be blocked in
+ * the page fault path, which prevents the pte entries from
+ * being set again. So even though the old empty PTE page may be
+ * concurrently freed and a new PTE page is filled into the pmd
+ * entry, it is still empty and can be removed.
+ *
+ * So here we only need to recheck if the state of the pmd entry
+ * still meets our requirements, rather than checking pmd_same()
+ * like elsewhere.
+ */
+ if (check_pmd_state(pmd) != SCAN_SUCCEED)
+ goto drop_pml;
ptl = pte_lockptr(mm, pmd);
if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
@@ -1771,20 +1788,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* repeating the anon_vma check protects from one category,
* and repeating the userfaultfd_wp() check from another.
*/
- if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
- skipped_uffd = true;
- } else {
+ if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
pmdp_get_lockless_sync();
+ success = true;
}
if (ptl != pml)
spin_unlock(ptl);
+drop_pml:
spin_unlock(pml);
mmu_notifier_invalidate_range_end(&range);
- if (!skipped_uffd) {
+ if (success) {
mm_dec_nr_ptes(mm);
page_table_check_pte_clear_range(mm, addr, pgt_pmd);
pte_free_defer(mm, pmd_pgtable(pgt_pmd));
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 820ba3b5cbfc..c6ed68604136 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1689,7 +1689,7 @@ static void kmemleak_scan(void)
unsigned long phys = object->pointer;
if (PHYS_PFN(phys) < min_low_pfn ||
- PHYS_PFN(phys + object->size) >= max_low_pfn)
+ PHYS_PFN(phys + object->size) > max_low_pfn)
__paint_it(object, KMEMLEAK_BLACK);
}
@@ -1855,7 +1855,7 @@ static int kmemleak_scan_thread(void *arg)
* Wait before the first scan to allow the system to fully initialize.
*/
if (first_run) {
- signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000);
+ signed long timeout = secs_to_jiffies(SECS_FIRST_SCAN);
first_run = 0;
while (timeout && !kthread_should_stop())
timeout = schedule_timeout_interruptible(timeout);
@@ -2241,7 +2241,7 @@ void __init kmemleak_init(void)
return;
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
- jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
+ jiffies_scan_wait = secs_to_jiffies(SECS_SCAN_WAIT);
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
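The kmemleak hunks above convert msecs_to_jiffies(x * 1000) into secs_to_jiffies(x). A trivial sketch of the equivalence, with an arbitrary constant:

#include <linux/jiffies.h>

static unsigned long first_scan_delay(void)
{
	/* Same result as msecs_to_jiffies(60 * 1000), without the unit juggling. */
	return secs_to_jiffies(60);
}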
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 3ea50f09311f..3df45c25c1f6 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -357,6 +357,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
size -= to_go;
}
}
+EXPORT_SYMBOL_GPL(kmsan_handle_dma);
void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
enum dma_data_direction dir)
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 9c58f081d84f..1bb505a08415 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -280,12 +280,8 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end)
start = (void *)PAGE_ALIGN_DOWN((u64)start);
size = PAGE_ALIGN((u64)end - (u64)start);
- shadow = memblock_alloc(size, PAGE_SIZE);
- origin = memblock_alloc(size, PAGE_SIZE);
-
- if (!shadow || !origin)
- panic("%s: Failed to allocate metadata memory for early boot range of size %llu",
- __func__, size);
+ shadow = memblock_alloc_or_panic(size, PAGE_SIZE);
+ origin = memblock_alloc_or_panic(size, PAGE_SIZE);
for (u64 addr = 0; addr < size; addr += PAGE_SIZE) {
page = virt_to_page_or_null((char *)start + addr);
diff --git a/mm/ksm.c b/mm/ksm.c
index 31a9bc365437..8be2b144fefd 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3262,6 +3262,25 @@ static void wait_while_offlining(void)
#endif /* CONFIG_MEMORY_HOTREMOVE */
#ifdef CONFIG_PROC_FS
+/*
+ * The process is considered mergeable if any of its VMAs
+ * is currently applicable to KSM.
+ *
+ * The mmap lock must be held in read mode.
+ */
+bool ksm_process_mergeable(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+ VMA_ITERATOR(vmi, mm, 0);
+ for_each_vma(vmi, vma)
+ if (vma->vm_flags & VM_MERGEABLE)
+ return true;
+
+ return false;
+}
+
long ksm_process_profit(struct mm_struct *mm)
{
return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE -
diff --git a/mm/madvise.c b/mm/madvise.c
index 0ceae57da7da..08b207f8e61e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -851,7 +851,12 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- zap_page_range_single(vma, start, end - start, NULL);
+ struct zap_details details = {
+ .reclaim_pt = true,
+ .even_cows = true,
+ };
+
+ zap_page_range_single(vma, start, end - start, &details);
return 0;
}
@@ -928,7 +933,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
end = vma->vm_end;
}
- VM_WARN_ON(start >= end);
+ /*
+ * If the memory region between start and end was
+ * originally backed by 4kB pages and then remapped to
+ * be backed by hugepages while mmap_lock was dropped,
+ * the adjustment for hugetlb vma above may have rounded
+ * end down to the start address.
+ */
+ if (start == end)
+ return 0;
+ VM_WARN_ON(start > end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
diff --git a/mm/memblock.c b/mm/memblock.c
index 095c18b5c430..95af35fd1389 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1692,6 +1692,26 @@ void * __init memblock_alloc_try_nid(
}
/**
+ * __memblock_alloc_or_panic - Try to allocate memory and panic on failure
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @func: caller func name
+ *
+ * This function attempts to allocate memory using memblock_alloc()
+ * and, in case of failure, calls panic() with a formatted message.
+ * It should not be used directly; use the memblock_alloc_or_panic() macro instead.
+ */
+void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align,
+ const char *func)
+{
+ void *addr = memblock_alloc(size, align);
+
+ if (unlikely(!addr))
+ panic("%s: Failed to allocate %pap bytes\n", func, &size);
+ return addr;
+}
+
+/**
* memblock_free_late - free pages directly to buddy allocator
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index a071fa43d479..2e9fa431bbf5 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -899,7 +899,7 @@ static void memcg_event_remove(struct work_struct *work)
*
* Called with wqh->lock held and interrupts disabled.
*/
-static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode,
int sync, void *key)
{
struct mem_cgroup_event *event =
@@ -1040,13 +1040,13 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
} else if (!strcmp(name, "memory.oom_control")) {
pr_warn_once("oom_control is deprecated and will be removed. "
"Please report your usecase to linux-mm-@kvack.org"
- " if you depend on this functionality. \n");
+ " if you depend on this functionality.\n");
event->register_event = mem_cgroup_oom_register_event;
event->unregister_event = mem_cgroup_oom_unregister_event;
} else if (!strcmp(name, "memory.pressure_level")) {
pr_warn_once("pressure_level is deprecated and will be removed. "
"Please report your usecase to linux-mm-@kvack.org "
- "if you depend on this functionality. \n");
+ "if you depend on this functionality.\n");
event->register_event = vmpressure_register_event;
event->unregister_event = vmpressure_unregister_event;
} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
@@ -1134,8 +1134,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
failed = iter;
mem_cgroup_iter_break(memcg, iter);
break;
- } else
- iter->oom_lock = true;
+ }
+ iter->oom_lock = true;
}
if (failed) {
@@ -1202,7 +1202,7 @@ struct oom_wait_info {
};
static int memcg_oom_wake_function(wait_queue_entry_t *wait,
- unsigned mode, int sync, void *arg)
+ unsigned int mode, int sync, void *arg)
{
struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
struct mem_cgroup *oom_wait_memcg;
@@ -1644,7 +1644,7 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
unsigned long nr = 0;
enum lru_list lru;
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
+ VM_BUG_ON((unsigned int)nid >= nr_node_ids);
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
@@ -1855,9 +1855,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
if (val > MAX_SWAPPINESS)
return -EINVAL;
- if (!mem_cgroup_is_root(memcg))
+ if (!mem_cgroup_is_root(memcg)) {
+ pr_info_once("Per memcg swappiness does not exist in cgroup v2. "
+ "See memory.reclaim or memory.swap.max there\n ");
WRITE_ONCE(memcg->swappiness, val);
- else
+ } else
WRITE_ONCE(vm_swappiness, val);
return 0;
@@ -1881,7 +1883,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
pr_warn_once("oom_control is deprecated and will be removed. "
"Please report your usecase to linux-mm-@kvack.org if you "
- "depend on this functionality. \n");
+ "depend on this functionality.\n");
/* cannot set to root cgroup and only 0 and 1 are allowed */
if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
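The one behavioural addition in this file is the pr_info_once() hint emitted when per-memcg swappiness is written on cgroup v1. A small userspace sketch (not part of this diff) of the write that triggers it, assuming a v1 memory controller mounted at the conventional /sys/fs/cgroup/memory path and a hypothetical group named mygroup:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Path assumes a cgroup v1 memory hierarchy mounted here. */
	const char *path = "/sys/fs/cgroup/memory/mygroup/memory.swappiness";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* The first such write now also logs a once-only hint pointing
	 * at memory.reclaim / memory.swap.max on cgroup v2. */
	if (write(fd, "10", 2) != 2)
		perror("write");
	close(fd);
	return 0;
}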
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7b3503d12aaf..a037ec92881d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1161,6 +1161,7 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
{
struct mem_cgroup *iter;
int ret = 0;
+ int i = 0;
BUG_ON(mem_cgroup_is_root(memcg));
@@ -1169,8 +1170,12 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct task_struct *task;
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
- while (!ret && (task = css_task_iter_next(&it)))
+ while (!ret && (task = css_task_iter_next(&it))) {
+ /* Avoid potential softlockup warning */
+ if ((++i & 1023) == 0)
+ cond_resched();
ret = fn(task, arg);
+ }
css_task_iter_end(&it);
if (ret) {
mem_cgroup_iter_break(memcg, iter);
@@ -1448,6 +1453,18 @@ unsigned long memcg_page_state_local_output(struct mem_cgroup *memcg, int item)
memcg_page_state_output_unit(item);
}
+#ifdef CONFIG_HUGETLB_PAGE
+static bool memcg_accounts_hugetlb(void)
+{
+ return cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+}
+#else /* CONFIG_HUGETLB_PAGE */
+static bool memcg_accounts_hugetlb(void)
+{
+ return false;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
int i;
@@ -1469,7 +1486,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
#ifdef CONFIG_HUGETLB_PAGE
if (unlikely(memory_stats[i].idx == NR_HUGETLB) &&
- !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+ !memcg_accounts_hugetlb())
continue;
#endif
size = memcg_page_state_output(memcg, memory_stats[i].idx);
@@ -1904,9 +1921,18 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old;
+ unsigned long flags;
stock = &per_cpu(memcg_stock, cpu);
+
+ /* drain_obj_stock requires stock_lock */
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ old = drain_obj_stock(stock);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+
drain_stock(stock);
+ obj_cgroup_put(old);
return 0;
}
@@ -2371,21 +2397,6 @@ done_restock:
return 0;
}
-/**
- * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
- * @memcg: memcg previously charged.
- * @nr_pages: number of pages previously charged.
- */
-void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
-{
- if (mem_cgroup_is_root(memcg))
- return;
-
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
-}
-
static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio);
@@ -2399,18 +2410,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg;
}
-/**
- * mem_cgroup_commit_charge - commit a previously successful try_charge().
- * @folio: folio to commit the charge to.
- * @memcg: memcg previously charged.
- */
-void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
-{
- css_get(&memcg->css);
- commit_charge(folio, memcg);
- memcg1_commit_charge(folio, memcg);
-}
-
static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
@@ -4176,6 +4175,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
memcg_memory_event(memcg, MEMCG_OOM);
if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
break;
+ cond_resched();
}
memcg_wb_domain_size_changed(memcg);
@@ -4498,7 +4498,9 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
if (ret)
goto out;
- mem_cgroup_commit_charge(folio, memcg);
+ css_get(&memcg->css);
+ commit_charge(folio, memcg);
+ memcg1_commit_charge(folio, memcg);
out:
return ret;
}
@@ -4516,38 +4518,37 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
}
/**
- * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
- * @memcg: memcg to charge.
- * @gfp: reclaim mode.
- * @nr_pages: number of pages to charge.
- *
- * This function is called when allocating a huge page folio to determine if
- * the memcg has the capacity for it. It does not commit the charge yet,
- * as the hugetlb folio itself has not been obtained from the hugetlb pool.
+ * mem_cgroup_charge_hugetlb - charge the memcg for a hugetlb folio
+ * @folio: folio being charged
+ * @gfp: reclaim mode
*
- * Once we have obtained the hugetlb folio, we can call
- * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
- * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
- * of try_charge().
+ * This function is called when allocating a huge page folio, after the page has
+ * already been obtained and charged to the appropriate hugetlb cgroup
+ * controller (if it is enabled).
*
- * Returns 0 on success. Otherwise, an error code is returned.
+ * Returns -ENOMEM if the memcg is already at its limit.
+ * Returns 0 if the charge succeeded or was skipped.
*/
-int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
- long nr_pages)
+int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp)
{
+ struct mem_cgroup *memcg = get_mem_cgroup_from_current();
+ int ret = 0;
+
/*
- * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
- * but do not attempt to commit charge later (or cancel on error) either.
+ * Even if memcg does not account for hugetlb, we still want to update
+ * system-level stats via lruvec_stat_mod_folio. Return 0 and skip
+ * charging the memcg.
*/
- if (mem_cgroup_disabled() || !memcg ||
- !cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
- !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
- return -EOPNOTSUPP;
+ if (mem_cgroup_disabled() || !memcg_accounts_hugetlb() ||
+ !memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ goto out;
- if (try_charge(memcg, gfp, nr_pages))
- return -ENOMEM;
+ if (charge_memcg(folio, memcg, gfp))
+ ret = -ENOMEM;
- return 0;
+out:
+ mem_cgroup_put(memcg);
+ return ret;
}
/**
@@ -4609,7 +4610,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
* correspond 1:1 to page and swap slot lifetimes: we charge the
* page to memory here, and uncharge swap when the slot is freed.
*/
- if (!mem_cgroup_disabled() && do_memsw_account()) {
+ if (do_memsw_account()) {
/*
* The swap entry might not get freed for a long time,
* let's not wait for it. The page already received a
@@ -4973,7 +4974,6 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
struct mem_cgroup *memcg, *swap_memcg;
unsigned int nr_entries;
- unsigned short oldid;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
@@ -5000,11 +5000,10 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
/* Get references for the tail pages, too */
if (nr_entries > 1)
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
- nr_entries);
- VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
+ swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry);
+
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
@@ -5035,7 +5034,6 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
unsigned int nr_pages = folio_nr_pages(folio);
struct page_counter *counter;
struct mem_cgroup *memcg;
- unsigned short oldid;
if (do_memsw_account())
return 0;
@@ -5064,10 +5062,10 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
/* Get references for the tail pages, too */
if (nr_pages > 1)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
- VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
+ swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+
return 0;
}
@@ -5081,7 +5079,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
struct mem_cgroup *memcg;
unsigned short id;
- id = swap_cgroup_record(entry, 0, nr_pages);
+ id = swap_cgroup_clear(entry, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
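With the try_charge/commit/cancel trio folded away, hugetlb allocation sites charge the already-obtained folio in a single call. A simplified sketch of that pattern, assuming hypothetical pool helpers (the real caller is the hugetlb allocation code elsewhere in this series):

/* Hedged sketch of the new single-call charging pattern. */
static struct folio *alloc_and_account_hugetlb(gfp_t gfp, int nid, int order)
{
	struct folio *folio;

	/* Hypothetical stand-in for the real hugetlb pool dequeue. */
	folio = hypothetical_dequeue_hugetlb_folio(nid, order);
	if (!folio)
		return ERR_PTR(-ENOSPC);

	/*
	 * Charges the current task's memcg, or silently skips the charge
	 * when hugetlb accounting is not enabled on the default hierarchy.
	 */
	if (mem_cgroup_charge_hugetlb(folio, gfp)) {
		hypothetical_return_to_hugetlb_pool(folio);
		return ERR_PTR(-ENOMEM);
	}
	return folio;
}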
diff --git a/mm/memfd.c b/mm/memfd.c
index 35a370d75c9a..37f7be57c2f5 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
return error;
}
-unsigned int *memfd_file_seals_ptr(struct file *file)
+static unsigned int *memfd_file_seals_ptr(struct file *file)
{
if (shmem_file(file))
return &SHMEM_I(file_inode(file))->seals;
@@ -327,15 +327,51 @@ static int check_sysctl_memfd_noexec(unsigned int *flags)
return 0;
}
-SYSCALL_DEFINE2(memfd_create,
- const char __user *, uname,
- unsigned int, flags)
+static inline bool is_write_sealed(unsigned int seals)
{
- unsigned int *file_seals;
- struct file *file;
- int fd, error;
- char *name;
- long len;
+ return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
+}
+
+static int check_write_seal(unsigned long *vm_flags_ptr)
+{
+ unsigned long vm_flags = *vm_flags_ptr;
+ unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE);
+
+ /* If a private mapping, then writability is irrelevant. */
+ if (!(mask & VM_SHARED))
+ return 0;
+
+ /*
+ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+ * write seals are active.
+ */
+ if (mask & VM_WRITE)
+ return -EPERM;
+
+ /*
+ * This is a read-only mapping, disallow mprotect() from making a
+ * write-sealed mapping writable in future.
+ */
+ *vm_flags_ptr &= ~VM_MAYWRITE;
+
+ return 0;
+}
+
+int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr)
+{
+ int err = 0;
+ unsigned int *seals_ptr = memfd_file_seals_ptr(file);
+ unsigned int seals = seals_ptr ? *seals_ptr : 0;
+
+ if (is_write_sealed(seals))
+ err = check_write_seal(vm_flags_ptr);
+
+ return err;
+}
+
+static int sanitize_flags(unsigned int *flags_ptr)
+{
+ unsigned int flags = *flags_ptr;
if (!(flags & MFD_HUGETLB)) {
if (flags & ~(unsigned int)MFD_ALL_FLAGS)
@@ -351,50 +387,52 @@ SYSCALL_DEFINE2(memfd_create,
if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
return -EINVAL;
- error = check_sysctl_memfd_noexec(&flags);
- if (error < 0)
- return error;
+ return check_sysctl_memfd_noexec(flags_ptr);
+}
- /* length includes terminating zero */
- len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
- if (len <= 0)
- return -EFAULT;
- if (len > MFD_NAME_MAX_LEN + 1)
- return -EINVAL;
+static char *alloc_name(const char __user *uname)
+{
+ int error;
+ char *name;
+ long len;
- name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
+ name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
if (!name)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
strcpy(name, MFD_NAME_PREFIX);
- if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+ /* returned length does not include terminating zero */
+ len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1);
+ if (len < 0) {
error = -EFAULT;
goto err_name;
- }
-
- /* terminating-zero may have changed after strnlen_user() returned */
- if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
- error = -EFAULT;
+ } else if (len > MFD_NAME_MAX_LEN) {
+ error = -EINVAL;
goto err_name;
}
- fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
- if (fd < 0) {
- error = fd;
- goto err_name;
- }
+ return name;
+
+err_name:
+ kfree(name);
+ return ERR_PTR(error);
+}
+
+static struct file *alloc_file(const char *name, unsigned int flags)
+{
+ unsigned int *file_seals;
+ struct file *file;
if (flags & MFD_HUGETLB) {
file = hugetlb_file_setup(name, 0, VM_NORESERVE,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
- } else
+ } else {
file = shmem_file_setup(name, 0, VM_NORESERVE);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_fd;
}
+ if (IS_ERR(file))
+ return file;
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
@@ -414,6 +452,37 @@ SYSCALL_DEFINE2(memfd_create,
*file_seals &= ~F_SEAL_SEAL;
}
+ return file;
+}
+
+SYSCALL_DEFINE2(memfd_create,
+ const char __user *, uname,
+ unsigned int, flags)
+{
+ struct file *file;
+ int fd, error;
+ char *name;
+
+ error = sanitize_flags(&flags);
+ if (error < 0)
+ return error;
+
+ name = alloc_name(uname);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+ if (fd < 0) {
+ error = fd;
+ goto err_name;
+ }
+
+ file = alloc_file(name, flags);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_fd;
+ }
+
fd_install(fd, file);
kfree(name);
return fd;
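The refactor also introduces memfd_check_seals_mmap(), which refuses new writable MAP_SHARED mappings of write-sealed memfds and clears VM_MAYWRITE on read-only ones. A userspace sketch (not part of this diff) of the behaviour that check enforces:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);
	void *p;

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	/* Seal against any future write access. */
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
		return 1;

	/* Writable shared mapping: refused with EPERM. */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		perror("mmap PROT_WRITE");

	/* A read-only shared mapping is allowed, but it loses VM_MAYWRITE,
	 * so a later mprotect(PROT_WRITE) on it fails as well. */
	p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	if (p != MAP_FAILED && mprotect(p, 4096, PROT_READ | PROT_WRITE) < 0)
		perror("mprotect PROT_WRITE");

	close(fd);
	return 0;
}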
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a7b8ccd29b6f..327e02fdc029 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -124,7 +124,7 @@ const struct attribute_group memory_failure_attr_group = {
.attrs = memory_failure_attr,
};
-static struct ctl_table memory_failure_table[] = {
+static const struct ctl_table memory_failure_table[] = {
{
.procname = "memory_failure_early_kill",
.data = &sysctl_memory_failure_early_kill,
@@ -1556,11 +1556,35 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
return ret;
}
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
- if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
- struct address_space *mapping;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
+ struct address_space *mapping;
+
+ if (folio_test_swapcache(folio)) {
+ pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
+ ttu &= ~TTU_HWPOISON;
+ }
+ /*
+ * Propagate the dirty bit from PTEs to struct page first, because we
+ * need this to decide if we should kill or just drop the page.
+ * XXX: the dirty test could be racy: set_page_dirty() may not always
+ * be called inside page lock (it's recommended but not enforced).
+ */
+ mapping = folio_mapping(folio);
+ if (!must_kill && !folio_test_dirty(folio) && mapping &&
+ mapping_can_writeback(mapping)) {
+ if (folio_mkclean(folio)) {
+ folio_set_dirty(folio);
+ } else {
+ ttu &= ~TTU_HWPOISON;
+ pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
+ pfn);
+ }
+ }
+
+ if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
* For hugetlb folios in shared mappings, try_to_unmap
* could potentially call huge_pmd_unshare. Because of
@@ -1572,7 +1596,7 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
if (!mapping) {
pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
folio_pfn(folio));
- return;
+ return -EBUSY;
}
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
@@ -1580,6 +1604,8 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
} else {
try_to_unmap(folio, ttu);
}
+
+ return folio_mapped(folio) ? -EBUSY : 0;
}
/*
@@ -1589,8 +1615,6 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
unsigned long pfn, int flags)
{
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
- struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
int forcekill;
@@ -1613,29 +1637,6 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
if (!folio_mapped(folio))
return true;
- if (folio_test_swapcache(folio)) {
- pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
- ttu &= ~TTU_HWPOISON;
- }
-
- /*
- * Propagate the dirty bit from PTEs to struct page first, because we
- * need this to decide if we should kill or just drop the page.
- * XXX: the dirty test could be racy: set_page_dirty() may not always
- * be called inside page lock (it's recommended but not enforced).
- */
- mapping = folio_mapping(folio);
- if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
- mapping_can_writeback(mapping)) {
- if (folio_mkclean(folio)) {
- folio_set_dirty(folio);
- } else {
- ttu &= ~TTU_HWPOISON;
- pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
- pfn);
- }
- }
-
/*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
@@ -1643,9 +1644,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
*/
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_poisoned_folio(folio, ttu);
-
- unmap_success = !folio_mapped(folio);
+ unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
if (!unmap_success)
pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
pfn, folio_mapcount(folio));
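unmap_poisoned_folio() now owns the swapcache and dirty-bit handling and reports back whether the folio is still mapped. A hedged sketch of the resulting caller contract (both in-tree callers appear elsewhere in this diff):

/* Hedged sketch: how a caller consumes the new return value. */
static int sketch_unmap_poisoned(struct folio *folio)
{
	int ret;

	folio_lock(folio);
	/* must_kill == false: a clean, writeback-able page may simply be
	 * dropped instead of killing the mapping processes. */
	ret = unmap_poisoned_folio(folio, folio_pfn(folio), false);
	folio_unlock(folio);

	/* 0: fully unmapped; -EBUSY: some mappings survived. */
	return ret;
}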
diff --git a/mm/memory.c b/mm/memory.c
index 398c031be9ba..4f6d9766a046 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1436,7 +1436,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
static inline bool should_zap_cows(struct zap_details *details)
{
/* By default, zap all pages */
- if (!details)
+ if (!details || details->reclaim_pt)
return true;
/* Or, we zap COWed pages only if the caller wants to */
@@ -1466,34 +1466,42 @@ static inline bool zap_drop_markers(struct zap_details *details)
/*
* This function makes sure that we'll replace the none pte with an uffd-wp
* swap special pte marker when necessary. Must be with the pgtable lock held.
+ *
+ * Returns true if any uffd-wp pte was installed, false otherwise.
*/
-static inline void
+static inline bool
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte, int nr,
struct zap_details *details, pte_t pteval)
{
+ bool was_installed = false;
+
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
/* Zap on anonymous always means dropping everything */
if (vma_is_anonymous(vma))
- return;
+ return false;
if (zap_drop_markers(details))
- return;
+ return false;
for (;;) {
/* the PFN in the PTE is irrelevant. */
- pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval))
+ was_installed = true;
if (--nr == 0)
break;
pte++;
addr += PAGE_SIZE;
}
+#endif
+ return was_installed;
}
static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
struct vm_area_struct *vma, struct folio *folio,
struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
unsigned long addr, struct zap_details *details, int *rss,
- bool *force_flush, bool *force_break)
+ bool *force_flush, bool *force_break, bool *any_skipped)
{
struct mm_struct *mm = tlb->mm;
bool delay_rmap = false;
@@ -1519,8 +1527,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entries(tlb, pte, nr, addr);
if (unlikely(userfaultfd_pte_wp(vma, ptent)))
- zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
- ptent);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte,
+ nr, details, ptent);
if (!delay_rmap) {
folio_remove_rmap_ptes(folio, page, nr, vma);
@@ -1544,7 +1552,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
unsigned int max_nr, unsigned long addr,
struct zap_details *details, int *rss, bool *force_flush,
- bool *force_break)
+ bool *force_break, bool *any_skipped)
{
const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
struct mm_struct *mm = tlb->mm;
@@ -1559,15 +1567,17 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
if (userfaultfd_pte_wp(vma, ptent))
- zap_install_uffd_wp_if_needed(vma, addr, pte, 1,
- details, ptent);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr,
+ pte, 1, details, ptent);
ksm_might_unmap_zero_page(mm, ptent);
return 1;
}
folio = page_folio(page);
- if (unlikely(!should_zap_folio(details, folio)))
+ if (unlikely(!should_zap_folio(details, folio))) {
+ *any_skipped = true;
return 1;
+ }
/*
* Make sure that the common "small folio" case is as fast as possible
@@ -1579,14 +1589,121 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
addr, details, rss, force_flush,
- force_break);
+ force_break, any_skipped);
return nr;
}
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
- details, rss, force_flush, force_break);
+ details, rss, force_flush, force_break, any_skipped);
return 1;
}
+static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+ unsigned int max_nr, unsigned long addr,
+ struct zap_details *details, int *rss, bool *any_skipped)
+{
+ swp_entry_t entry;
+ int nr = 1;
+
+ *any_skipped = true;
+ entry = pte_to_swp_entry(ptent);
+ if (is_device_private_entry(entry) ||
+ is_device_exclusive_entry(entry)) {
+ struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = page_folio(page);
+
+ if (unlikely(!should_zap_folio(details, folio)))
+ return 1;
+ /*
+ * Both device private/exclusive mappings should only
+ * work with anonymous pages so far, so we don't need to
+ * consider the uffd-wp bit when zapping. For more
+ * information, see zap_install_uffd_wp_if_needed().
+ */
+ WARN_ON_ONCE(!vma_is_anonymous(vma));
+ rss[mm_counter(folio)]--;
+ if (is_device_private_entry(entry))
+ folio_remove_rmap_pte(folio, page, vma);
+ folio_put(folio);
+ } else if (!non_swap_entry(entry)) {
+ /* Genuine swap entries, hence private anon pages */
+ if (!should_zap_cows(details))
+ return 1;
+
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ rss[MM_SWAPENTS] -= nr;
+ free_swap_and_cache_nr(entry, nr);
+ } else if (is_migration_entry(entry)) {
+ struct folio *folio = pfn_swap_entry_folio(entry);
+
+ if (!should_zap_folio(details, folio))
+ return 1;
+ rss[mm_counter(folio)]--;
+ } else if (pte_marker_entry_uffd_wp(entry)) {
+ /*
+ * For anon: always drop the marker; for file: only
+ * drop the marker if explicitly requested.
+ */
+ if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
+ return 1;
+ } else if (is_guard_swp_entry(entry)) {
+ /*
+ * Ordinary zapping should not remove guard PTE
+ * markers. Only do so if we should remove PTE markers
+ * in general.
+ */
+ if (!zap_drop_markers(details))
+ return 1;
+ } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) {
+ if (!should_zap_cows(details))
+ return 1;
+ } else {
+ /* We should have covered all the swap entry types */
+ pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
+ WARN_ON_ONCE(1);
+ }
+ clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm);
+ *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
+
+ return nr;
+}
+
+static inline int do_zap_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pte_t *pte,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details, int *rss,
+ bool *force_flush, bool *force_break,
+ bool *any_skipped)
+{
+ pte_t ptent = ptep_get(pte);
+ int max_nr = (end - addr) / PAGE_SIZE;
+ int nr = 0;
+
+ /* Skip all consecutive none ptes */
+ if (pte_none(ptent)) {
+ for (nr = 1; nr < max_nr; nr++) {
+ ptent = ptep_get(pte + nr);
+ if (!pte_none(ptent))
+ break;
+ }
+ max_nr -= nr;
+ if (!max_nr)
+ return nr;
+ pte += nr;
+ addr += nr * PAGE_SIZE;
+ }
+
+ if (pte_present(ptent))
+ nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr,
+ details, rss, force_flush, force_break,
+ any_skipped);
+ else
+ nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
+ details, rss, any_skipped);
+
+ return nr;
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
@@ -1598,9 +1715,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
- swp_entry_t entry;
+ pmd_t pmdval;
+ unsigned long start = addr;
+ bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
+ bool direct_reclaim = true;
int nr;
+retry:
tlb_change_page_size(tlb, PAGE_SIZE);
init_rss_vec(rss);
start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1610,90 +1731,35 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
- pte_t ptent = ptep_get(pte);
- struct folio *folio;
- struct page *page;
- int max_nr;
+ bool any_skipped = false;
- nr = 1;
- if (pte_none(ptent))
- continue;
-
- if (need_resched())
+ if (need_resched()) {
+ direct_reclaim = false;
break;
-
- if (pte_present(ptent)) {
- max_nr = (end - addr) / PAGE_SIZE;
- nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
- addr, details, rss, &force_flush,
- &force_break);
- if (unlikely(force_break)) {
- addr += nr * PAGE_SIZE;
- break;
- }
- continue;
}
- entry = pte_to_swp_entry(ptent);
- if (is_device_private_entry(entry) ||
- is_device_exclusive_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
- folio = page_folio(page);
- if (unlikely(!should_zap_folio(details, folio)))
- continue;
- /*
- * Both device private/exclusive mappings should only
- * work with anonymous page so far, so we don't need to
- * consider uffd-wp bit when zap. For more information,
- * see zap_install_uffd_wp_if_needed().
- */
- WARN_ON_ONCE(!vma_is_anonymous(vma));
- rss[mm_counter(folio)]--;
- if (is_device_private_entry(entry))
- folio_remove_rmap_pte(folio, page, vma);
- folio_put(folio);
- } else if (!non_swap_entry(entry)) {
- max_nr = (end - addr) / PAGE_SIZE;
- nr = swap_pte_batch(pte, max_nr, ptent);
- /* Genuine swap entries, hence a private anon pages */
- if (!should_zap_cows(details))
- continue;
- rss[MM_SWAPENTS] -= nr;
- free_swap_and_cache_nr(entry, nr);
- } else if (is_migration_entry(entry)) {
- folio = pfn_swap_entry_folio(entry);
- if (!should_zap_folio(details, folio))
- continue;
- rss[mm_counter(folio)]--;
- } else if (pte_marker_entry_uffd_wp(entry)) {
- /*
- * For anon: always drop the marker; for file: only
- * drop the marker if explicitly requested.
- */
- if (!vma_is_anonymous(vma) &&
- !zap_drop_markers(details))
- continue;
- } else if (is_guard_swp_entry(entry)) {
- /*
- * Ordinary zapping should not remove guard PTE
- * markers. Only do so if we should remove PTE markers
- * in general.
- */
- if (!zap_drop_markers(details))
- continue;
- } else if (is_hwpoison_entry(entry) ||
- is_poisoned_swp_entry(entry)) {
- if (!should_zap_cows(details))
- continue;
- } else {
- /* We should have covered all the swap entry types */
- pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
- WARN_ON_ONCE(1);
+ nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
+ &force_flush, &force_break, &any_skipped);
+ if (any_skipped)
+ can_reclaim_pt = false;
+ if (unlikely(force_break)) {
+ addr += nr * PAGE_SIZE;
+ direct_reclaim = false;
+ break;
}
- clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
+ /*
+ * Fast path: try to hold the pmd lock and unmap the PTE page.
+ *
+ * If the pte lock was released midway (retry case), or if the attempt
+ * to hold the pmd lock failed, then we need to recheck all pte entries
+ * to ensure they are still none, thereby preventing the pte entries
+ * from being repopulated by another thread.
+ */
+ if (can_reclaim_pt && direct_reclaim && addr == end)
+ direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
+
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
@@ -1713,6 +1779,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (force_flush)
tlb_flush_mmu(tlb);
+ if (addr != end) {
+ cond_resched();
+ force_flush = false;
+ force_break = false;
+ goto retry;
+ }
+
+ if (can_reclaim_pt) {
+ if (direct_reclaim)
+ free_pte(mm, start, tlb, pmdval);
+ else
+ try_to_free_pte(mm, pmd, start, tlb);
+ }
+
return addr;
}
@@ -1935,7 +2015,6 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
struct mmu_notifier_range range;
struct mmu_gather tlb;
- lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
@@ -2971,8 +3050,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
next = pgd_addr_end(addr, end);
if (pgd_none(*pgd) && !create)
continue;
- if (WARN_ON_ONCE(pgd_leaf(*pgd)))
- return -EINVAL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
+ err = -EINVAL;
+ break;
+ }
if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
if (!create)
continue;
@@ -3013,7 +3094,6 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
{
return __apply_to_page_range(mm, addr, size, fn, data, false);
}
-EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
/*
* handle_pte_fault chooses page fault handler according to an entry which was
@@ -4189,8 +4269,10 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
gfp, entry))
return folio;
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
folio_put(folio);
}
+ count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
order = next_order(&orders, order);
}
@@ -5102,7 +5184,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
!(vma->vm_flags & VM_SHARED);
int type, nr_pages;
- unsigned long addr = vmf->address;
+ unsigned long addr;
+ bool needs_fallback = false;
+
+fallback:
+ addr = vmf->address;
/* Did we COW the page? */
if (is_cow)
@@ -5141,7 +5227,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
* approach also applies to non-anonymous-shmem faults to avoid
* inflating the RSS of the process.
*/
- if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
+ if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
+ unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
pgoff_t idx = folio_page_idx(folio, page);
@@ -5177,9 +5264,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
goto unlock;
} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
- update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
- ret = VM_FAULT_NOPAGE;
- goto unlock;
+ needs_fallback = true;
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto fallback;
}
folio_ref_add(folio, nr_pages - 1);
@@ -5625,7 +5712,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
ignore_writable = true;
/* Migrate to the requested node */
- if (!migrate_misplaced_folio(folio, vma, target_nid)) {
+ if (!migrate_misplaced_folio(folio, target_nid)) {
nid = target_nid;
flags |= TNF_MIGRATED;
task_numa_fault(last_cpupid, nid, nr_pages, flags);
@@ -6069,7 +6156,8 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
}
/*
- * By the time we get here, we already hold the mm semaphore
+ * By the time we get here, we already hold either the VMA lock or the
+ * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
*
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __folio_lock_or_retry().
@@ -6185,7 +6273,7 @@ static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_r
/*
* Helper for page fault handling.
*
- * This is kind of equivalend to "mmap_read_lock()" followed
+ * This is kind of equivalent to "mmap_read_lock()" followed
* by "find_extend_vma()", except it's a lot more careful about
* the locking (and will drop the lock on failure).
*
@@ -6746,10 +6834,8 @@ void __might_fault(const char *file, int line)
if (pagefault_disabled())
return;
__might_sleep(file, line);
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
-#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
@@ -6961,7 +7047,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc)
void ptlock_free(struct ptdesc *ptdesc)
{
- kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
+ if (ptdesc->ptl)
+ kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif
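To summarize the new control flow: after the PTE loop, zap_pte_range() only reclaims the PTE table when the caller asked for it via zap_details, nothing in the range was deliberately left in place (any_skipped), and the whole range was processed; it then either clears the pmd directly or re-verifies the entries under the pmd lock. A hedged pseudo-C condensation, not the literal kernel code; whole_range_zapped stands in for "addr == end and the loop was never interrupted":

/* Hedged condensation of the reclaim gating added above. */
if (!reclaim_pt_is_enabled(start, end, details) || any_skipped)
	return;				/* not requested, or PTEs remain */

if (whole_range_zapped && try_get_and_clear_pmd(mm, pmd, &pmdval))
	free_pte(mm, start, tlb, pmdval);	/* pmd cleared in place */
else
	try_to_free_pte(mm, pmd, start, tlb);	/* re-check under pmd lock */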
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c43b4e7fb298..16cf9e17077e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -219,11 +219,30 @@ void put_online_mems(void)
bool movable_node_enabled = false;
-#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-int mhp_default_online_type = MMOP_OFFLINE;
-#else
-int mhp_default_online_type = MMOP_ONLINE;
-#endif
+static int mhp_default_online_type = -1;
+int mhp_get_default_online_type(void)
+{
+ if (mhp_default_online_type >= 0)
+ return mhp_default_online_type;
+
+ if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE))
+ mhp_default_online_type = MMOP_OFFLINE;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO))
+ mhp_default_online_type = MMOP_ONLINE;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL))
+ mhp_default_online_type = MMOP_ONLINE_KERNEL;
+ else if (IS_ENABLED(CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE))
+ mhp_default_online_type = MMOP_ONLINE_MOVABLE;
+ else
+ mhp_default_online_type = MMOP_OFFLINE;
+
+ return mhp_default_online_type;
+}
+
+void mhp_set_default_online_type(int online_type)
+{
+ mhp_default_online_type = online_type;
+}
static int __init setup_memhp_default_state(char *str)
{
@@ -650,6 +669,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
* this and the first chunk to online will be pageblock_nr_pages.
*/
for (pfn = start_pfn; pfn < end_pfn;) {
+ struct page *page = pfn_to_page(pfn);
int order;
/*
@@ -664,7 +684,14 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
else
order = MAX_PAGE_ORDER;
- (*online_page_callback)(pfn_to_page(pfn), order);
+ /*
+ * Exposing the page to the buddy by freeing can cause
+ * issues with debug_pagealloc enabled: some archs don't
+ * like double-unmappings. So treat them like any pages that
+ * were allocated from the buddy.
+ */
+ debug_pagealloc_map_pages(page, 1 << order);
+ (*online_page_callback)(page, order);
pfn += (1UL << order);
}
@@ -1320,7 +1347,7 @@ static int check_hotplug_memory_range(u64 start, u64 size)
static int online_memory_block(struct memory_block *mem, void *arg)
{
- mem->online_type = mhp_default_online_type;
+ mem->online_type = mhp_get_default_online_type();
return device_online(&mem->dev);
}
@@ -1567,7 +1594,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
merge_system_ram_resource(res);
/* online pages if requested */
- if (mhp_default_online_type != MMOP_OFFLINE)
+ if (mhp_get_default_online_type() != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
@@ -1795,26 +1822,24 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (folio_test_large(folio))
pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
- /*
- * HWPoison pages have elevated reference counts so the migration would
- * fail on them. It also doesn't make any sense to migrate them in the
- * first place. Still try to unmap such a page in case it is still mapped
- * (keep the unmap as the catch all safety net).
- */
+ if (!folio_try_get(folio))
+ continue;
+
+ if (unlikely(page_folio(page) != folio))
+ goto put_folio;
+
if (folio_test_hwpoison(folio) ||
(folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
if (WARN_ON(folio_test_lru(folio)))
folio_isolate_lru(folio);
- if (folio_mapped(folio))
- unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK);
- continue;
- }
-
- if (!folio_try_get(folio))
- continue;
+ if (folio_mapped(folio)) {
+ folio_lock(folio);
+ unmap_poisoned_folio(folio, pfn, false);
+ folio_unlock(folio);
+ }
- if (unlikely(page_folio(page) != folio))
goto put_folio;
+ }
if (!isolate_folio_to_list(folio, &source)) {
if (__ratelimit(&migrate_rs)) {
@@ -1830,7 +1855,7 @@ put_folio:
nodemask_t nmask = node_states[N_MEMORY];
struct migration_target_control mtc = {
.nmask = &nmask,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .gfp_mask = GFP_KERNEL | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
.reason = MR_MEMORY_HOTPLUG,
};
int ret;
@@ -1992,8 +2017,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
- MEMORY_OFFLINE | REPORT_FAILURE,
- GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
+ MEMORY_OFFLINE | REPORT_FAILURE);
if (ret) {
reason = "failure to isolate range";
goto failed_removal_pcplists_disabled;
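The default online type is now derived lazily from the new Kconfig choice and can still be changed at runtime through mhp_set_default_online_type(). A userspace sketch (not part of this diff) of setting it via the existing auto_online_blocks sysfs knob, whose accepted strings mirror the MMOP_* values:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Existing sysfs knob; accepts "offline", "online",
	 * "online_kernel" or "online_movable". */
	const char *knob = "/sys/devices/system/memory/auto_online_blocks";
	const char *val = "online_movable";
	int fd = open(knob, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}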
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 162407fbf2bc..a9eea051b0d6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -196,6 +196,37 @@ int numa_nearest_node(int node, unsigned int state)
}
EXPORT_SYMBOL_GPL(numa_nearest_node);
+/**
+ * nearest_node_nodemask - Find the node in @mask at the nearest distance
+ * from @node.
+ *
+ * @node: a valid node ID to start the search from.
+ * @mask: a pointer to a nodemask representing the allowed nodes.
+ *
+ * This function iterates over all nodes in @mask, calculates the
+ * distance from the starting @node, and returns the node ID that is
+ * closest to @node, or MAX_NUMNODES if no node is found.
+ *
+ * Note that @node must be a valid node ID usable with node_distance();
+ * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
+ * or unexpected behavior.
+ */
+int nearest_node_nodemask(int node, nodemask_t *mask)
+{
+ int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
+
+ for_each_node_mask(n, *mask) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(nearest_node_nodemask);
+
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
@@ -647,7 +678,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
*/
if ((flags & MPOL_MF_MOVE_ALL) ||
(!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
- if (!isolate_hugetlb(folio, qp->pagelist))
+ if (!folio_isolate_hugetlb(folio, qp->pagelist))
qp->nr_failed++;
unlock:
spin_unlock(ptl);
@@ -2205,9 +2236,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
*/
preferred_gfp = gfp | __GFP_NOWARN;
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask);
+ page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
if (!page)
- page = __alloc_pages_noprof(gfp, order, nid, NULL);
+ page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
return page;
}
@@ -2222,7 +2253,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
*
* Return: The page on success or NULL if allocation fails.
*/
-struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
+static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
struct mempolicy *pol, pgoff_t ilx, int nid)
{
nodemask_t *nodemask;
@@ -2253,8 +2284,9 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
* First, try to allocate THP only on local node, but
* don't reclaim unnecessarily, just compact.
*/
- page = __alloc_pages_node_noprof(nid,
- gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+ page = __alloc_frozen_pages_noprof(
+ gfp | __GFP_THISNODE | __GFP_NORETRY, order,
+ nid, NULL);
if (page || !(gfp & __GFP_DIRECT_RECLAIM))
return page;
/*
@@ -2266,7 +2298,7 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
}
}
- page = __alloc_pages_noprof(gfp, order, nid, nodemask);
+ page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
if (unlikely(pol->mode == MPOL_INTERLEAVE ||
pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
@@ -2285,8 +2317,13 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
struct mempolicy *pol, pgoff_t ilx, int nid)
{
- return page_rmappable_folio(alloc_pages_mpol_noprof(gfp | __GFP_COMP,
- order, pol, ilx, nid));
+ struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
+ ilx, nid);
+ if (!page)
+ return NULL;
+
+ set_page_refcounted(page);
+ return page_rmappable_folio(page);
}
/**
@@ -2300,7 +2337,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
* NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
* VMA to prevent it from going away. Should be used for all allocations
* for folios that will be mapped into user space, excepting hugetlbfs, and
- * excepting where direct use of alloc_pages_mpol() is more appropriate.
+ * excepting where direct use of folio_alloc_mpol() is more appropriate.
*
* Return: The folio on success or NULL if allocation fails.
*/
@@ -2321,6 +2358,21 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
}
EXPORT_SYMBOL(vma_alloc_folio_noprof);
+struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
+{
+ struct mempolicy *pol = &default_policy;
+
+ /*
+ * No reference counting needed for current->mempolicy
+ * nor system default_policy
+ */
+ if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+ pol = get_task_policy(current);
+
+ return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
+ numa_node_id());
+}
+
/**
* alloc_pages - Allocate pages.
* @gfp: GFP flags.
@@ -2337,17 +2389,11 @@ EXPORT_SYMBOL(vma_alloc_folio_noprof);
*/
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
{
- struct mempolicy *pol = &default_policy;
-
- /*
- * No reference counting needed for current->mempolicy
- * nor system default_policy
- */
- if (!in_interrupt() && !(gfp & __GFP_THISNODE))
- pol = get_task_policy(current);
+ struct page *page = alloc_frozen_pages_noprof(gfp, order);
- return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX,
- numa_node_id());
+ if (page)
+ set_page_refcounted(page);
+ return page;
}
EXPORT_SYMBOL(alloc_pages_noprof);
@@ -2357,7 +2403,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
}
EXPORT_SYMBOL(folio_alloc_noprof);
-static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
+static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
@@ -2376,13 +2422,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
if (delta) {
nr_allocated = alloc_pages_bulk_noprof(gfp,
interleave_nodes(pol), NULL,
- nr_pages_per_node + 1, NULL,
+ nr_pages_per_node + 1,
page_array);
delta--;
} else {
nr_allocated = alloc_pages_bulk_noprof(gfp,
interleave_nodes(pol), NULL,
- nr_pages_per_node, NULL, page_array);
+ nr_pages_per_node, page_array);
}
page_array += nr_allocated;
@@ -2392,7 +2438,7 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
return total_allocated;
}
-static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
@@ -2431,7 +2477,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
if (weight && node_isset(node, nodes)) {
node_pages = min(rem_pages, weight);
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
- NULL, page_array);
+ page_array);
page_array += nr_allocated;
total_allocated += nr_allocated;
/* if that's all the pages, no need to interleave */
@@ -2494,7 +2540,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
if (!node_pages)
break;
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
- NULL, page_array);
+ page_array);
page_array += nr_allocated;
total_allocated += nr_allocated;
if (total_allocated == nr_pages)
@@ -2507,7 +2553,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
return total_allocated;
}
-static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
+static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
@@ -2518,11 +2564,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
- nr_pages, NULL, page_array);
+ nr_pages, page_array);
if (nr_allocated < nr_pages)
nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
- nr_pages - nr_allocated, NULL,
+ nr_pages - nr_allocated,
page_array + nr_allocated);
return nr_allocated;
}
@@ -2533,7 +2579,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
* It can accelerate memory allocation especially interleaving
* allocate memory.
*/
-unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
+unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
unsigned long nr_pages, struct page **page_array)
{
struct mempolicy *pol = &default_policy;
@@ -2544,21 +2590,21 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
pol = get_task_policy(current);
if (pol->mode == MPOL_INTERLEAVE)
- return alloc_pages_bulk_array_interleave(gfp, pol,
+ return alloc_pages_bulk_interleave(gfp, pol,
nr_pages, page_array);
if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
- return alloc_pages_bulk_array_weighted_interleave(
+ return alloc_pages_bulk_weighted_interleave(
gfp, pol, nr_pages, page_array);
if (pol->mode == MPOL_PREFERRED_MANY)
- return alloc_pages_bulk_array_preferred_many(gfp,
+ return alloc_pages_bulk_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
nid = numa_node_id();
nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
return alloc_pages_bulk_noprof(gfp, nid, nodemask,
- nr_pages, NULL, page_array);
+ nr_pages, page_array);
}
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
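A minimal sketch of how a caller might use the new nearest_node_nodemask() helper to fall back to the closest allowed node; the wrapper and fallback policy are illustrative, not from this diff:

/* Hedged sketch: choose the closest node from an allowed set. */
static int pick_closest_allowed_node(int from_node, const nodemask_t *allowed)
{
	nodemask_t mask = *allowed;
	int nid = nearest_node_nodemask(from_node, &mask);

	/* MAX_NUMNODES means the mask was empty; fall back to the caller. */
	if (nid == MAX_NUMNODES)
		nid = from_node;
	return nid;
}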
diff --git a/mm/migrate.c b/mm/migrate.c
index cc68583c86f9..97f0edf0c032 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -68,10 +68,6 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
if (!folio)
goto out;
- if (unlikely(folio_test_slab(folio)))
- goto out_putfolio;
- /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */
- smp_rmb();
/*
* Check movable flag before taking the page lock because
* we use non-atomic bitops on newly allocated page flags so
@@ -79,10 +75,6 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
*/
if (unlikely(!__folio_test_movable(folio)))
goto out_putfolio;
- /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */
- smp_rmb();
- if (unlikely(folio_test_slab(folio)))
- goto out_putfolio;
/*
* As movable pages are not isolated from LRU lists, concurrent
@@ -136,7 +128,7 @@ static void putback_movable_folio(struct folio *folio)
*
* This function shall be used whenever the isolated pageset has been
* built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
- * and isolate_hugetlb().
+ * and folio_isolate_hugetlb().
*/
void putback_movable_pages(struct list_head *l)
{
@@ -145,7 +137,7 @@ void putback_movable_pages(struct list_head *l)
list_for_each_entry_safe(folio, folio2, l, lru) {
if (unlikely(folio_test_hugetlb(folio))) {
- folio_putback_active_hugetlb(folio);
+ folio_putback_hugetlb(folio);
continue;
}
list_del(&folio->lru);
@@ -177,7 +169,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
bool isolated, lru;
if (folio_test_hugetlb(folio))
- return isolate_hugetlb(folio, list);
+ return folio_isolate_hugetlb(folio, list);
lru = !__folio_test_movable(folio);
if (lru)
@@ -526,15 +518,13 @@ static int __folio_migrate_mapping(struct address_space *mapping,
if (folio_test_anon(folio) && folio_test_large(folio))
mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
folio_ref_add(newfolio, nr); /* add cache reference */
- if (folio_test_swapbacked(folio)) {
+ if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
- if (folio_test_swapcache(folio)) {
- folio_set_swapcache(newfolio);
- newfolio->private = folio_get_private(folio);
- }
+ if (folio_test_swapcache(folio)) {
+ folio_set_swapcache(newfolio);
+ newfolio->private = folio_get_private(folio);
entries = nr;
} else {
- VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
entries = 1;
}
@@ -1459,7 +1449,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
if (folio_ref_count(src) == 1) {
/* page was freed from under us. So we are done. */
- folio_putback_active_hugetlb(src);
+ folio_putback_hugetlb(src);
return MIGRATEPAGE_SUCCESS;
}
@@ -1542,19 +1532,19 @@ out_unlock:
folio_unlock(src);
out:
if (rc == MIGRATEPAGE_SUCCESS)
- folio_putback_active_hugetlb(src);
+ folio_putback_hugetlb(src);
else if (rc != -EAGAIN)
list_move_tail(&src->lru, ret);
/*
- * If migration was not successful and there's a freeing callback, use
- * it. Otherwise, put_page() will drop the reference grabbed during
- * isolation.
+ * If migration was not successful and there's a freeing callback,
+ * return the folio to that special allocator. Otherwise, simply drop
+ * our additional reference.
*/
if (put_new_folio)
put_new_folio(dst, private);
else
- folio_putback_active_hugetlb(dst);
+ folio_put(dst);
return rc;
}
@@ -1695,6 +1685,81 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
return nr_failed;
}
+static void migrate_folios_move(struct list_head *src_folios,
+ struct list_head *dst_folios,
+ free_folio_t put_new_folio, unsigned long private,
+ enum migrate_mode mode, int reason,
+ struct list_head *ret_folios,
+ struct migrate_pages_stats *stats,
+ int *retry, int *thp_retry, int *nr_failed,
+ int *nr_retry_pages)
+{
+ struct folio *folio, *folio2, *dst, *dst2;
+ bool is_thp;
+ int nr_pages;
+ int rc;
+
+ dst = list_first_entry(dst_folios, struct folio, lru);
+ dst2 = list_next_entry(dst, lru);
+ list_for_each_entry_safe(folio, folio2, src_folios, lru) {
+ is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
+ nr_pages = folio_nr_pages(folio);
+
+ cond_resched();
+
+ rc = migrate_folio_move(put_new_folio, private,
+ folio, dst, mode,
+ reason, ret_folios);
+ /*
+ * The rules are:
+ * Success: folio will be freed
+ * -EAGAIN: stay on the unmap_folios list
+ * Other errno: put on ret_folios list
+ */
+ switch (rc) {
+ case -EAGAIN:
+ *retry += 1;
+ *thp_retry += is_thp;
+ *nr_retry_pages += nr_pages;
+ break;
+ case MIGRATEPAGE_SUCCESS:
+ stats->nr_succeeded += nr_pages;
+ stats->nr_thp_succeeded += is_thp;
+ break;
+ default:
+ *nr_failed += 1;
+ stats->nr_thp_failed += is_thp;
+ stats->nr_failed_pages += nr_pages;
+ break;
+ }
+ dst = dst2;
+ dst2 = list_next_entry(dst, lru);
+ }
+}
+
+static void migrate_folios_undo(struct list_head *src_folios,
+ struct list_head *dst_folios,
+ free_folio_t put_new_folio, unsigned long private,
+ struct list_head *ret_folios)
+{
+ struct folio *folio, *folio2, *dst, *dst2;
+
+ dst = list_first_entry(dst_folios, struct folio, lru);
+ dst2 = list_next_entry(dst, lru);
+ list_for_each_entry_safe(folio, folio2, src_folios, lru) {
+ int old_page_state = 0;
+ struct anon_vma *anon_vma = NULL;
+
+ __migrate_folio_extract(dst, &old_page_state, &anon_vma);
+ migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
+ anon_vma, true, ret_folios);
+ list_del(&dst->lru);
+ migrate_folio_undo_dst(dst, true, put_new_folio, private);
+ dst = dst2;
+ dst2 = list_next_entry(dst, lru);
+ }
+}
+
/*
* migrate_pages_batch() first unmaps folios in the from list as many as
* possible, then move the unmapped folios.
@@ -1717,7 +1782,7 @@ static int migrate_pages_batch(struct list_head *from,
int pass = 0;
bool is_thp = false;
bool is_large = false;
- struct folio *folio, *folio2, *dst = NULL, *dst2;
+ struct folio *folio, *folio2, *dst = NULL;
int rc, rc_saved = 0, nr_pages;
LIST_HEAD(unmap_folios);
LIST_HEAD(dst_folios);
@@ -1888,42 +1953,11 @@ move:
thp_retry = 0;
nr_retry_pages = 0;
- dst = list_first_entry(&dst_folios, struct folio, lru);
- dst2 = list_next_entry(dst, lru);
- list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
- is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
- nr_pages = folio_nr_pages(folio);
-
- cond_resched();
-
- rc = migrate_folio_move(put_new_folio, private,
- folio, dst, mode,
- reason, ret_folios);
- /*
- * The rules are:
- * Success: folio will be freed
- * -EAGAIN: stay on the unmap_folios list
- * Other errno: put on ret_folios list
- */
- switch(rc) {
- case -EAGAIN:
- retry++;
- thp_retry += is_thp;
- nr_retry_pages += nr_pages;
- break;
- case MIGRATEPAGE_SUCCESS:
- stats->nr_succeeded += nr_pages;
- stats->nr_thp_succeeded += is_thp;
- break;
- default:
- nr_failed++;
- stats->nr_thp_failed += is_thp;
- stats->nr_failed_pages += nr_pages;
- break;
- }
- dst = dst2;
- dst2 = list_next_entry(dst, lru);
- }
+ /* Move the unmapped folios */
+ migrate_folios_move(&unmap_folios, &dst_folios,
+ put_new_folio, private, mode, reason,
+ ret_folios, stats, &retry, &thp_retry,
+ &nr_failed, &nr_retry_pages);
}
nr_failed += retry;
stats->nr_thp_failed += thp_retry;
@@ -1932,20 +1966,8 @@ move:
rc = rc_saved ? : nr_failed;
out:
/* Cleanup remaining folios */
- dst = list_first_entry(&dst_folios, struct folio, lru);
- dst2 = list_next_entry(dst, lru);
- list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
- int old_page_state = 0;
- struct anon_vma *anon_vma = NULL;
-
- __migrate_folio_extract(dst, &old_page_state, &anon_vma);
- migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
- anon_vma, true, ret_folios);
- list_del(&dst->lru);
- migrate_folio_undo_dst(dst, true, put_new_folio, private);
- dst = dst2;
- dst2 = list_next_entry(dst, lru);
- }
+ migrate_folios_undo(&unmap_folios, &dst_folios,
+ put_new_folio, private, ret_folios);
return rc;
}
@@ -2208,7 +2230,7 @@ static int __add_folio_for_migration(struct folio *folio, int node,
return -EACCES;
if (folio_test_hugetlb(folio)) {
- if (isolate_hugetlb(folio, pagelist))
+ if (folio_isolate_hugetlb(folio, pagelist))
return 1;
} else if (folio_isolate_lru(folio)) {
list_add_tail(&folio->lru, pagelist);
@@ -2683,8 +2705,7 @@ int migrate_misplaced_folio_prepare(struct folio *folio,
* elevated reference count on the folio. This function will un-isolate the
* folio, dereferencing the folio before returning.
*/
-int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
- int node)
+int migrate_misplaced_folio(struct folio *folio, int node)
{
pg_data_t *pgdat = NODE_DATA(node);
int nr_remaining;
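Besides the batch move/undo split, this file completes the isolate_hugetlb() to folio_isolate_hugetlb() and folio_putback_active_hugetlb() to folio_putback_hugetlb() renames. A brief hedged sketch of the isolation pattern as callers now spell it (the wrapper name is illustrative):

/* Hedged sketch: isolating a folio for migration with the renamed API. */
static int sketch_isolate_for_migration(struct folio *folio,
					struct list_head *pagelist)
{
	if (folio_test_hugetlb(folio))
		/* was isolate_hugetlb() before this series */
		return folio_isolate_hugetlb(folio, pagelist) ? 0 : -EBUSY;

	if (folio_isolate_lru(folio)) {
		list_add_tail(&folio->lru, pagelist);
		return 0;
	}
	return -EBUSY;
}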
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 9cf26592ac93..5bd888223cc8 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -840,20 +840,15 @@ void migrate_device_finalize(unsigned long *src_pfns,
dst = src;
}
+ if (!folio_is_zone_device(dst))
+ folio_add_lru(dst);
remove_migration_ptes(src, dst, 0);
folio_unlock(src);
-
- if (folio_is_zone_device(src))
- folio_put(src);
- else
- folio_putback_lru(src);
+ folio_put(src);
if (dst != src) {
folio_unlock(dst);
- if (folio_is_zone_device(dst))
- folio_put(dst);
- else
- folio_putback_lru(dst);
+ folio_put(dst);
}
}
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 24b68b425afb..2630cc30147e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1585,13 +1585,17 @@ void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
{
void *ptr;
+ /*
+ * Kmemleak will explicitly scan mem_map by traversing all valid
+ * `struct page` objects, so this allocation does not need to be added
+ * to the kmemleak scan list.
+ */
if (exact_nid)
ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
- MEMBLOCK_ALLOC_ACCESSIBLE,
+ MEMBLOCK_ALLOC_NOLEAKTRACE,
nid);
else
ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
- MEMBLOCK_ALLOC_ACCESSIBLE,
+ MEMBLOCK_ALLOC_NOLEAKTRACE,
nid);
if (ptr && size > 0)
diff --git a/mm/mmap.c b/mm/mmap.c
index aec208f90337..cda01071c7b1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -111,8 +111,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
return mlock_future_ok(current->mm, current->mm->def_flags, len)
? 0 : -EAGAIN;
}
-static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
- unsigned long addr, unsigned long request, unsigned long flags);
+
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long newbrk, oldbrk, origbrk;
@@ -278,8 +277,62 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
return true;
}
-/*
+/**
+ * do_mmap() - Perform a userland memory mapping into the current process
+ * address space of length @len with protection bits @prot, mmap flags @flags
+ * (from which VMA flags will be inferred), and any additional VMA flags to
+ * apply @vm_flags. If this is a file-backed mapping then the file is specified
+ * in @file and page offset into the file via @pgoff.
+ *
+ * This function does not perform security checks on the file and assumes, if
+ * @uf is non-NULL, the caller has provided a list head to track unmap events
+ * for userfaultfd @uf.
+ *
+ * It also simply indicates whether memory population is required by setting
+ * @populate, which must be non-NULL, expecting the caller to actually perform
+ * this task itself if appropriate.
+ *
+ * This function will invoke architecture-specific (and if provided and
+ * relevant, file system-specific) logic to determine the most appropriate
+ * unmapped area in which to place the mapping if not MAP_FIXED.
+ *
+ * Callers which require userland mmap() behaviour should invoke vm_mmap(),
+ * which is also exported for module use.
+ *
+ * Callers which require this behaviour, minus the security checks, userfaultfd
+ * and populate handling, and which take the mmap write lock themselves, should
+ * call this function.
+ *
+ * Note that the returned address may reside within a merged VMA if an
+ * appropriate merge were to take place, so it doesn't necessarily specify the
+ * start of a VMA, rather only the start of a valid mapped range of length
+ * @len bytes, rounded down to the nearest page size.
+ *
* The caller must write-lock current->mm->mmap_lock.
+ *
+ * @file: An optional struct file pointer describing the file which is to be
+ * mapped, if a file-backed mapping.
+ * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the
+ * address at which to perform this mapping. See mmap (2) for details. Must be
+ * page-aligned.
+ * @len: The length of the mapping. Will be page-aligned and must be at least 1
+ * page in size.
+ * @prot: Protection bits describing access required to the mapping. See mmap
+ * (2) for details.
+ * @flags: Flags specifying how the mapping should be performed, see mmap (2)
+ * for details.
+ * @vm_flags: VMA flags which should be set by default, or 0 otherwise.
+ * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise.
+ * @populate: A pointer to a value which will be set to 0 if no population of
+ * the range is required, or the number of bytes to populate if it is. Must be
+ * non-NULL. See mmap (2) for details as to under what circumstances population
+ * of the range occurs.
+ * @uf: An optional pointer to a list head to track userfaultfd unmap events
+ * should unmapping events arise. If provided, it is up to the caller to manage
+ * this.
+ *
+ * Returns: Either an error, or the address at which the requested mapping has
+ * been performed.
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
@@ -292,6 +345,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
*populate = 0;
+ mmap_assert_write_locked(mm);
+
if (!len)
return -EINVAL;
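
As a usage sketch of the contract documented in the kernel-doc above: the caller takes the mmap write lock, passes a non-NULL @populate, and performs population and userfaultfd completion itself afterwards. This mirrors the usual vm_mmap_pgoff()-style calling pattern; example_mmap() and its error handling are illustrative, not part of this patch.

static unsigned long example_mmap(struct file *file, unsigned long addr,
		unsigned long len, unsigned long prot, unsigned long flags,
		unsigned long pgoff)
{
	struct mm_struct *mm = current->mm;
	unsigned long populate = 0;
	unsigned long ret;
	LIST_HEAD(uf);		/* userfaultfd unmap events, managed by the caller */

	if (mmap_write_lock_killable(mm))
		return -EINTR;
	ret = do_mmap(file, addr, len, prot, flags, 0, pgoff, &populate, &uf);
	mmap_write_unlock(mm);

	userfaultfd_unmap_complete(mm, &uf);
	if (!IS_ERR_VALUE(ret) && populate)
		mm_populate(ret, populate);	/* the caller performs population itself */
	return ret;
}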
@@ -369,8 +424,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (file) {
struct inode *inode = file_inode(file);
- unsigned int seals = memfd_file_seals(file);
unsigned long flags_mask;
+ int err;
if (!file_mmap_ok(file, inode, pgoff, len))
return -EOVERFLOW;
@@ -410,8 +465,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
- else if (is_readonly_sealed(seals, vm_flags))
- vm_flags &= ~VM_MAYWRITE;
fallthrough;
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
@@ -431,6 +484,14 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
default:
return -EINVAL;
}
+
+ /*
+ * Check to see if we are violating any seals and update VMA
+ * flags if necessary to avoid future seal violations.
+ */
+ err = memfd_check_seals_mmap(file, &vm_flags);
+ if (err)
+ return (unsigned long)err;
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
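
memfd_check_seals_mmap() itself lives in mm/memfd.c (see the memfd.c entry in the diffstat) and is not shown here. As a purely hypothetical sketch of the kind of logic that replaces the removed is_readonly_sealed() branch above: reject a writable shared mapping of a write-sealed memfd, otherwise drop VM_MAYWRITE so the mapping can never be made writable later. The helper name and exact behaviour below are assumptions, not the in-tree implementation.

/* Hedged sketch only -- the real helper is in mm/memfd.c and may differ. */
static int example_check_seals_mmap(struct file *file, vm_flags_t *vm_flags)
{
	unsigned int seals = memfd_file_seals(file);

	if (!(seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)))
		return 0;

	/* A writable shared mapping of a write-sealed memfd is not allowed. */
	if ((*vm_flags & (VM_SHARED | VM_WRITE)) == (VM_SHARED | VM_WRITE))
		return -EPERM;

	/* Otherwise map read-only and forbid a later mprotect(PROT_WRITE). */
	*vm_flags &= ~VM_MAYWRITE;
	return 0;
}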
@@ -581,115 +642,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
-/**
- * unmapped_area() - Find an area between the low_limit and the high_limit with
- * the correct alignment and offset, all from @info. Note: current->mm is used
- * for the search.
- *
- * @info: The unmapped area information including the range [low_limit -
- * high_limit), the alignment offset and mask.
- *
- * Return: A memory address or -ENOMEM.
- */
-static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
-{
- unsigned long length, gap;
- unsigned long low_limit, high_limit;
- struct vm_area_struct *tmp;
- VMA_ITERATOR(vmi, current->mm, 0);
-
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask + info->start_gap;
- if (length < info->length)
- return -ENOMEM;
-
- low_limit = info->low_limit;
- if (low_limit < mmap_min_addr)
- low_limit = mmap_min_addr;
- high_limit = info->high_limit;
-retry:
- if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
- return -ENOMEM;
-
- /*
- * Adjust for the gap first so it doesn't interfere with the
- * later alignment. The first step is the minimum needed to
- * fulfill the start gap, the next step is the minimum to align
- * that. It is the minimum needed to fulfill both.
- */
- gap = vma_iter_addr(&vmi) + info->start_gap;
- gap += (info->align_offset - gap) & info->align_mask;
- tmp = vma_next(&vmi);
- if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
- if (vm_start_gap(tmp) < gap + length - 1) {
- low_limit = tmp->vm_end;
- vma_iter_reset(&vmi);
- goto retry;
- }
- } else {
- tmp = vma_prev(&vmi);
- if (tmp && vm_end_gap(tmp) > gap) {
- low_limit = vm_end_gap(tmp);
- vma_iter_reset(&vmi);
- goto retry;
- }
- }
-
- return gap;
-}
-
-/**
- * unmapped_area_topdown() - Find an area between the low_limit and the
- * high_limit with the correct alignment and offset at the highest available
- * address, all from @info. Note: current->mm is used for the search.
- *
- * @info: The unmapped area information including the range [low_limit -
- * high_limit), the alignment offset and mask.
- *
- * Return: A memory address or -ENOMEM.
- */
-static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
-{
- unsigned long length, gap, gap_end;
- unsigned long low_limit, high_limit;
- struct vm_area_struct *tmp;
- VMA_ITERATOR(vmi, current->mm, 0);
-
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask + info->start_gap;
- if (length < info->length)
- return -ENOMEM;
-
- low_limit = info->low_limit;
- if (low_limit < mmap_min_addr)
- low_limit = mmap_min_addr;
- high_limit = info->high_limit;
-retry:
- if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
- return -ENOMEM;
-
- gap = vma_iter_end(&vmi) - info->length;
- gap -= (gap - info->align_offset) & info->align_mask;
- gap_end = vma_iter_end(&vmi);
- tmp = vma_next(&vmi);
- if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
- if (vm_start_gap(tmp) < gap_end) {
- high_limit = vm_start_gap(tmp);
- vma_iter_reset(&vmi);
- goto retry;
- }
- } else {
- tmp = vma_prev(&vmi);
- if (tmp && vm_end_gap(tmp) > gap) {
- high_limit = tmp->vm_start;
- vma_iter_reset(&vmi);
- goto retry;
- }
- }
-
- return gap;
-}
-
/*
* Determine if the allocation needs to ensure that there is no
* existing mapping within it's guard gaps, for use as start_gap.
@@ -989,211 +941,6 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
return vma;
}
-/*
- * Verify that the stack growth is acceptable and
- * update accounting. This is shared with both the
- * grow-up and grow-down cases.
- */
-static int acct_stack_growth(struct vm_area_struct *vma,
- unsigned long size, unsigned long grow)
-{
- struct mm_struct *mm = vma->vm_mm;
- unsigned long new_start;
-
- /* address space limit tests */
- if (!may_expand_vm(mm, vma->vm_flags, grow))
- return -ENOMEM;
-
- /* Stack limit test */
- if (size > rlimit(RLIMIT_STACK))
- return -ENOMEM;
-
- /* mlock limit tests */
- if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
- return -ENOMEM;
-
- /* Check to ensure the stack will not grow into a hugetlb-only region */
- new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
- vma->vm_end - size;
- if (is_hugepage_only_range(vma->vm_mm, new_start, size))
- return -EFAULT;
-
- /*
- * Overcommit.. This must be the final test, as it will
- * update security statistics.
- */
- if (security_vm_enough_memory_mm(mm, grow))
- return -ENOMEM;
-
- return 0;
-}
-
-#if defined(CONFIG_STACK_GROWSUP)
-/*
- * PA-RISC uses this for its stack.
- * vma is the last one with address > vma->vm_end. Have to extend vma.
- */
-static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next;
- unsigned long gap_addr;
- int error = 0;
- VMA_ITERATOR(vmi, mm, vma->vm_start);
-
- if (!(vma->vm_flags & VM_GROWSUP))
- return -EFAULT;
-
- mmap_assert_write_locked(mm);
-
- /* Guard against exceeding limits of the address space. */
- address &= PAGE_MASK;
- if (address >= (TASK_SIZE & PAGE_MASK))
- return -ENOMEM;
- address += PAGE_SIZE;
-
- /* Enforce stack_guard_gap */
- gap_addr = address + stack_guard_gap;
-
- /* Guard against overflow */
- if (gap_addr < address || gap_addr > TASK_SIZE)
- gap_addr = TASK_SIZE;
-
- next = find_vma_intersection(mm, vma->vm_end, gap_addr);
- if (next && vma_is_accessible(next)) {
- if (!(next->vm_flags & VM_GROWSUP))
- return -ENOMEM;
- /* Check that both stack segments have the same anon_vma? */
- }
-
- if (next)
- vma_iter_prev_range_limit(&vmi, address);
-
- vma_iter_config(&vmi, vma->vm_start, address);
- if (vma_iter_prealloc(&vmi, vma))
- return -ENOMEM;
-
- /* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma))) {
- vma_iter_free(&vmi);
- return -ENOMEM;
- }
-
- /* Lock the VMA before expanding to prevent concurrent page faults */
- vma_start_write(vma);
- /* We update the anon VMA tree. */
- anon_vma_lock_write(vma->anon_vma);
-
- /* Somebody else might have raced and expanded it already */
- if (address > vma->vm_end) {
- unsigned long size, grow;
-
- size = address - vma->vm_start;
- grow = (address - vma->vm_end) >> PAGE_SHIFT;
-
- error = -ENOMEM;
- if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags, grow);
- anon_vma_interval_tree_pre_update_vma(vma);
- vma->vm_end = address;
- /* Overwrite old entry in mtree. */
- vma_iter_store(&vmi, vma);
- anon_vma_interval_tree_post_update_vma(vma);
-
- perf_event_mmap(vma);
- }
- }
- }
- anon_vma_unlock_write(vma->anon_vma);
- vma_iter_free(&vmi);
- validate_mm(mm);
- return error;
-}
-#endif /* CONFIG_STACK_GROWSUP */
-
-/*
- * vma is the first one with address < vma->vm_start. Have to extend vma.
- * mmap_lock held for writing.
- */
-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *prev;
- int error = 0;
- VMA_ITERATOR(vmi, mm, vma->vm_start);
-
- if (!(vma->vm_flags & VM_GROWSDOWN))
- return -EFAULT;
-
- mmap_assert_write_locked(mm);
-
- address &= PAGE_MASK;
- if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
- return -EPERM;
-
- /* Enforce stack_guard_gap */
- prev = vma_prev(&vmi);
- /* Check that both stack segments have the same anon_vma? */
- if (prev) {
- if (!(prev->vm_flags & VM_GROWSDOWN) &&
- vma_is_accessible(prev) &&
- (address - prev->vm_end < stack_guard_gap))
- return -ENOMEM;
- }
-
- if (prev)
- vma_iter_next_range_limit(&vmi, vma->vm_start);
-
- vma_iter_config(&vmi, address, vma->vm_end);
- if (vma_iter_prealloc(&vmi, vma))
- return -ENOMEM;
-
- /* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma))) {
- vma_iter_free(&vmi);
- return -ENOMEM;
- }
-
- /* Lock the VMA before expanding to prevent concurrent page faults */
- vma_start_write(vma);
- /* We update the anon VMA tree. */
- anon_vma_lock_write(vma->anon_vma);
-
- /* Somebody else might have raced and expanded it already */
- if (address < vma->vm_start) {
- unsigned long size, grow;
-
- size = vma->vm_end - address;
- grow = (vma->vm_start - address) >> PAGE_SHIFT;
-
- error = -ENOMEM;
- if (grow <= vma->vm_pgoff) {
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags, grow);
- anon_vma_interval_tree_pre_update_vma(vma);
- vma->vm_start = address;
- vma->vm_pgoff -= grow;
- /* Overwrite old entry in mtree. */
- vma_iter_store(&vmi, vma);
- anon_vma_interval_tree_post_update_vma(vma);
-
- perf_event_mmap(vma);
- }
- }
- }
- anon_vma_unlock_write(vma->anon_vma);
- vma_iter_free(&vmi);
- validate_mm(mm);
- return error;
-}
-
/* enforced gap between the expanding stack and other mappings. */
unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
@@ -1325,58 +1072,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}
-unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf)
-{
- unsigned long ret;
- bool writable_file_mapping = false;
-
- /* Check to see if MDWE is applicable. */
- if (map_deny_write_exec(vm_flags, vm_flags))
- return -EACCES;
-
- /* Allow architectures to sanity-check the vm_flags. */
- if (!arch_validate_flags(vm_flags))
- return -EINVAL;
-
- /* Map writable and ensure this isn't a sealed memfd. */
- if (file && is_shared_maywrite(vm_flags)) {
- int error = mapping_map_writable(file->f_mapping);
-
- if (error)
- return error;
- writable_file_mapping = true;
- }
-
- ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
-
- /* Clear our write mapping regardless of error. */
- if (writable_file_mapping)
- mapping_unmap_writable(file->f_mapping);
-
- validate_mm(current->mm);
- return ret;
-}
-
-static int __vm_munmap(unsigned long start, size_t len, bool unlock)
-{
- int ret;
- struct mm_struct *mm = current->mm;
- LIST_HEAD(uf);
- VMA_ITERATOR(vmi, mm, start);
-
- if (mmap_write_lock_killable(mm))
- return -EINTR;
-
- ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);
- if (ret || !unlock)
- mmap_write_unlock(mm);
-
- userfaultfd_unmap_complete(mm, &uf);
- return ret;
-}
-
int vm_munmap(unsigned long start, size_t len)
{
return __vm_munmap(start, len, false);
@@ -1512,88 +1207,6 @@ out:
return ret;
}
-/*
- * do_brk_flags() - Increase the brk vma if the flags match.
- * @vmi: The vma iterator
- * @addr: The start address
- * @len: The length of the increase
- * @vma: The vma,
- * @flags: The VMA Flags
- *
- * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
- * do not match then create a new anonymous VMA. Eventually we may be able to
- * do some brk-specific accounting here.
- */
-static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, unsigned long len, unsigned long flags)
-{
- struct mm_struct *mm = current->mm;
-
- /*
- * Check against address space limits by the changed size
- * Note: This happens *after* clearing old mappings in some code paths.
- */
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
- if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
- return -ENOMEM;
-
- if (mm->map_count > sysctl_max_map_count)
- return -ENOMEM;
-
- if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
-
- /*
- * Expand the existing vma if possible; Note that singular lists do not
- * occur after forking, so the expand will only happen on new VMAs.
- */
- if (vma && vma->vm_end == addr) {
- VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
-
- vmg.prev = vma;
- /* vmi is positioned at prev, which this mode expects. */
- vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
-
- if (vma_merge_new_range(&vmg))
- goto out;
- else if (vmg_nomem(&vmg))
- goto unacct_fail;
- }
-
- if (vma)
- vma_iter_next_range(vmi);
- /* create a vma struct for an anonymous mapping */
- vma = vm_area_alloc(mm);
- if (!vma)
- goto unacct_fail;
-
- vma_set_anonymous(vma);
- vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
- vm_flags_init(vma, flags);
- vma->vm_page_prot = vm_get_page_prot(flags);
- vma_start_write(vma);
- if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
- goto mas_store_fail;
-
- mm->map_count++;
- validate_mm(mm);
- ksm_add_vma(vma);
-out:
- perf_event_mmap(vma);
- mm->total_vm += len >> PAGE_SHIFT;
- mm->data_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED)
- mm->locked_vm += (len >> PAGE_SHIFT);
- vm_flags_set(vma, VM_SOFTDIRTY);
- return 0;
-
-mas_store_fail:
- vm_area_free(vma);
-unacct_fail:
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
-}
-
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
@@ -1664,7 +1277,6 @@ void exit_mmap(struct mm_struct *mm)
goto destroy;
}
- lru_add_drain();
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
@@ -2107,7 +1719,6 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
vma, new_start, length, false, true))
return -ENOMEM;
- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
next = vma_next(&vmi);
if (new_end > old_start) {
@@ -2132,3 +1743,55 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
/* Shrink the vma to just the new range */
return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
}
+
+#ifdef CONFIG_MMU
+/*
+ * Obtain a read lock on mm->mmap_lock, if the specified address is below the
+ * start of the VMA, the intent is to perform a write, and it is a
+ * downward-growing stack, then attempt to expand the stack to contain it.
+ *
+ * This function is intended only for obtaining an argument page from an ELF
+ * image, and is almost certainly NOT what you want to use for any other
+ * purpose.
+ *
+ * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the
+ * VMA referenced must not be linked in any user-visible tree, i.e. it must be a
+ * new VMA being mapped.
+ *
+ * The function assumes that addr is either contained within the VMA or below
+ * it, and makes no attempt to validate this value beyond that.
+ *
+ * Returns true if the read lock was obtained and a stack was perhaps expanded,
+ * false if the stack expansion failed.
+ *
+ * On stack expansion the function temporarily acquires an mmap write lock
+ * before downgrading it.
+ */
+bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
+ struct vm_area_struct *new_vma,
+ unsigned long addr, bool write)
+{
+ if (!write || addr >= new_vma->vm_start) {
+ mmap_read_lock(mm);
+ return true;
+ }
+
+ if (!(new_vma->vm_flags & VM_GROWSDOWN))
+ return false;
+
+ mmap_write_lock(mm);
+ if (expand_downwards(new_vma, addr)) {
+ mmap_write_unlock(mm);
+ return false;
+ }
+
+ mmap_write_downgrade(mm);
+ return true;
+}
+#else
+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, bool write)
+{
+ return false;
+}
+#endif
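
mmap_read_lock_maybe_expand() is aimed at the ELF argument-copying path, which works on a brand-new stack VMA that is not yet visible to other threads. A hedged usage sketch follows; example_get_arg_page() is illustrative, and only mmap_read_lock_maybe_expand(), get_user_pages_remote() and mmap_read_unlock() are real interfaces here.

/*
 * Hedged sketch of the intended calling pattern: fault in an argument page
 * of a freshly created stack VMA, expanding the stack first if the target
 * address lies below vma->vm_start.
 */
static struct page *example_get_arg_page(struct mm_struct *mm,
		struct vm_area_struct *new_stack_vma,
		unsigned long addr, bool write)
{
	struct page *page;
	long ret;

	if (!mmap_read_lock_maybe_expand(mm, new_stack_vma, addr, write))
		return NULL;	/* stack expansion failed */

	/* mmap_lock is now held for read; fault the page in. */
	ret = get_user_pages_remote(mm, addr, 1,
				    write ? FOLL_WRITE : 0,
				    &page, NULL);
	mmap_read_unlock(mm);

	return ret <= 0 ? NULL : page;
}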
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index f186d57df2c6..e7dbaf96aa17 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -17,51 +17,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
-#ifdef CONFIG_MEMCG
-
-/*
- * Size of the buffer for memcg path names. Ignoring stack trace support,
- * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
- */
-#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- do { \
- if (trace_mmap_lock_##type##_enabled()) { \
- char buf[MEMCG_PATH_BUF_SIZE]; \
- get_mm_memcg_path(mm, buf, sizeof(buf)); \
- trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
- } \
- } while (0)
-
-#else /* !CONFIG_MEMCG */
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
-
-#endif /* CONFIG_MEMCG */
-
#ifdef CONFIG_TRACING
-#ifdef CONFIG_MEMCG
-/*
- * Write the given mm_struct's memcg path to a buffer. If the path cannot be
- * determined, empty string is written.
- */
-static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
-{
- struct mem_cgroup *memcg;
-
- buf[0] = '\0';
- memcg = get_mem_cgroup_from_mm(mm);
- if (memcg == NULL)
- return;
- if (memcg->css.cgroup)
- cgroup_path(memcg->css.cgroup, buf, buflen);
- css_put(&memcg->css);
-}
-
-#endif /* CONFIG_MEMCG */
-
/*
* Trace calls must be in a separate file, as otherwise there's a circular
* dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
@@ -69,20 +25,20 @@ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
- TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
+ trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
bool success)
{
- TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
+ trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
- TRACE_MMAP_LOCK_EVENT(released, mm, write);
+ trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 99b3e9408aa0..7aa6f18c500b 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -311,11 +311,34 @@ static inline void tlb_table_invalidate(struct mmu_gather *tlb)
}
}
-static void tlb_remove_table_one(void *table)
+#ifdef CONFIG_PT_RECLAIM
+static inline void __tlb_remove_table_one_rcu(struct rcu_head *head)
+{
+ struct ptdesc *ptdesc;
+
+ ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+ __tlb_remove_table(ptdesc);
+}
+
+static inline void __tlb_remove_table_one(void *table)
+{
+ struct ptdesc *ptdesc;
+
+ ptdesc = table;
+ call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu);
+}
+#else
+static inline void __tlb_remove_table_one(void *table)
{
tlb_remove_table_sync_one();
__tlb_remove_table(table);
}
+#endif /* CONFIG_PT_RECLAIM */
+
+static void tlb_remove_table_one(void *table)
+{
+ __tlb_remove_table_one(table);
+}
static void tlb_table_flush(struct mmu_gather *tlb)
{
diff --git a/mm/mseal.c b/mm/mseal.c
index 81d6e980e8a9..c27197ac04e8 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -217,9 +217,9 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
unsigned long end;
struct mm_struct *mm = current->mm;
- ret = can_do_mseal(flags);
- if (ret)
- return ret;
+ /* Verify flags not set. */
+ if (flags)
+ return -EINVAL;
start = untagged_addr(start);
if (!PAGE_ALIGNED(start))
diff --git a/mm/numa.c b/mm/numa.c
index e2eec07707d1..f1787d7713a6 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -37,13 +37,7 @@ void __init alloc_node_data(int nid)
void __init alloc_offline_node_data(int nid)
{
pg_data_t *pgdat;
-
- pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES);
- if (!pgdat)
- panic("Cannot allocate %zuB for node %d.\n",
- sizeof(*pgdat), nid);
-
- node_data[nid] = pgdat;
+ node_data[nid] = memblock_alloc_or_panic(sizeof(*pgdat), SMP_CACHE_BYTES);
}
/* Stub functions: */
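
memblock_alloc_or_panic() is the helper this series converts open-coded callers to, here and in mm/percpu.c further down. Its definition is not part of this hunk; judging from the pattern it replaces, it behaves roughly like the hedged sketch below (the real helper is declared in <linux/memblock.h> and may differ, for example in how it reports the failing caller).

/* Hedged sketch: allocate from memblock and panic on failure, as the
 * removed open-coded callers did. */
static inline void *example_memblock_alloc_or_panic(phys_addr_t size,
						    phys_addr_t align)
{
	void *ptr = memblock_alloc(size, align);

	if (!ptr)
		panic("%s: Failed to allocate %llu bytes\n", __func__,
		      (unsigned long long)size);
	return ptr;
}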
diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c
index 031fb9961bf7..9d55679d99ce 100644
--- a/mm/numa_emulation.c
+++ b/mm/numa_emulation.c
@@ -8,11 +8,12 @@
#include <linux/memblock.h>
#include <linux/numa_memblks.h>
#include <asm/numa.h>
+#include <acpi/acpi_numa.h>
#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
-static int emu_nid_to_phys[MAX_NUMNODES];
+int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;
int __init numa_emu_cmdline(char *str)
@@ -379,6 +380,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
int max_emu_nid, dfl_phys_nid;
int i, j, ret;
+ nodemask_t physnode_mask = numa_nodes_parsed;
if (!emu_cmdline)
goto no_emu;
@@ -395,7 +397,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
* split the system RAM into N fake nodes.
*/
if (strchr(emu_cmdline, 'U')) {
- nodemask_t physnode_mask = numa_nodes_parsed;
unsigned long n;
int nid = 0;
@@ -465,9 +466,6 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
*/
max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
- /* commit */
- *numa_meminfo = ei;
-
/* Make sure numa_nodes_parsed only contains emulated nodes */
nodes_clear(numa_nodes_parsed);
for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
@@ -475,10 +473,21 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
ei.blk[i].nid != NUMA_NO_NODE)
node_set(ei.blk[i].nid, numa_nodes_parsed);
- numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys));
+ /* fix pxm_to_node_map[] and node_to_pxm_map[] to avoid collision
+ * with faked numa nodes, particularly during later memory hotplug
+ * handling, and also update numa_nodes_parsed accordingly.
+ */
+ ret = fix_pxm_node_maps(max_emu_nid);
+ if (ret < 0)
+ goto no_emu;
+
+ /* commit */
+ *numa_meminfo = ei;
+
+ numa_emu_update_cpu_to_node(emu_nid_to_phys, max_emu_nid + 1);
/* make sure all emulated nodes are mapped to a physical node */
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+ for (i = 0; i < max_emu_nid + 1; i++)
if (emu_nid_to_phys[i] == NUMA_NO_NODE)
emu_nid_to_phys[i] = dfl_phys_nid;
@@ -501,12 +510,34 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
numa_set_distance(i, j, dist);
}
}
+ for (i = 0; i < numa_distance_cnt; i++) {
+ for (j = 0; j < numa_distance_cnt; j++) {
+ int physi, physj;
+ u8 dist;
+
+ /* distance between fake nodes is already ok */
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE &&
+ emu_nid_to_phys[j] != NUMA_NO_NODE)
+ continue;
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE)
+ physi = emu_nid_to_phys[i];
+ else
+ physi = i - max_emu_nid;
+ if (emu_nid_to_phys[j] != NUMA_NO_NODE)
+ physj = emu_nid_to_phys[j];
+ else
+ physj = j - max_emu_nid;
+ dist = phys_dist[physi * numa_dist_cnt + physj];
+ numa_set_distance(i, j, dist);
+ }
+ }
/* free the copied physical distance table */
memblock_free(phys_dist, phys_size);
return;
no_emu:
+ numa_nodes_parsed = physnode_mask;
/* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
emu_nid_to_phys[i] = i;
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
index a3877e9bc878..ff4054f4334d 100644
--- a/mm/numa_memblks.c
+++ b/mm/numa_memblks.c
@@ -7,7 +7,7 @@
#include <linux/numa.h>
#include <linux/numa_memblks.h>
-static int numa_distance_cnt;
+int numa_distance_cnt;
static u8 *numa_distance;
nodemask_t numa_nodes_parsed __initdata;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1c485beb0b93..1cf121ad7085 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,7 @@
#include <linux/init.h>
#include <linux/mmu_notifier.h>
#include <linux/cred.h>
+#include <linux/nmi.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -430,10 +431,15 @@ static void dump_tasks(struct oom_control *oc)
mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
else {
struct task_struct *p;
+ int i = 0;
rcu_read_lock();
- for_each_process(p)
+ for_each_process(p) {
+ /* Avoid potential softlockup warning */
+ if ((++i & 1023) == 0)
+ touch_softlockup_watchdog();
dump_task(p, oc);
+ }
rcu_read_unlock();
}
}
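
A note on the throttling arithmetic above: (++i & 1023) == 0 is a power-of-two modulus, true exactly when i reaches a multiple of 1024, so touch_softlockup_watchdog() runs once per 1024 tasks dumped. It is equivalent to the (slightly more expensive) spelling:

	if ((++i % 1024) == 0)	/* same effect; & 1023 avoids the division */
		touch_softlockup_watchdog();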
@@ -699,7 +705,7 @@ static void queue_oom_reaper(struct task_struct *tsk)
}
#ifdef CONFIG_SYSCTL
-static struct ctl_table vm_oom_kill_table[] = {
+static const struct ctl_table vm_oom_kill_table[] = {
{
.procname = "panic_on_oom",
.data = &sysctl_panic_on_oom,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d9861e42b2bd..eb55ece39c56 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -942,26 +942,25 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio);
wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE);
- wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
- if (wb_thresh > wb_max_thresh)
- wb_thresh = wb_max_thresh;
/*
- * With strictlimit flag, the wb_thresh is treated as
- * a hard limit in balance_dirty_pages() and wb_position_ratio().
- * It's possible that wb_thresh is close to zero, not because
- * the device is slow, but because it has been inactive.
- * To prevent occasional writes from being blocked, we raise wb_thresh.
+	 * It's very possible that wb_thresh is close to 0 not because the
+	 * device is slow, but because it has remained inactive for a long time.
+	 * Honour such devices with a reasonably good (hopefully IO efficient)
+	 * threshold, so that occasional writes won't be blocked and active
+	 * writes can ramp up the threshold quickly.
*/
- if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
- unsigned long limit = hard_dirty_limit(dom, dtc->thresh);
- u64 wb_scale_thresh = 0;
-
- if (limit > dtc->dirty)
- wb_scale_thresh = (limit - dtc->dirty) / 100;
- wb_thresh = max(wb_thresh, min(wb_scale_thresh, wb_max_thresh / 4));
+ if (thresh > dtc->dirty) {
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT))
+ wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100);
+ else
+ wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8);
}
+ wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
+ if (wb_thresh > wb_max_thresh)
+ wb_thresh = wb_max_thresh;
+
return wb_thresh;
}
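
A hedged worked example of the reordered logic above, with purely illustrative numbers: take thresh = 80,000 pages, dtc->dirty = 16,000, and a wb_thresh of, say, 50 pages for a long-idle bdi. Without BDI_CAP_STRICTLIMIT the new floor is (80,000 - 16,000) / 8 = 8,000 pages, so wb_thresh is raised to 8,000; under BDI_CAP_STRICTLIMIT the gentler (80,000 - 16,000) / 100 = 640 pages applies instead. Because the clamp against wb_max_thresh now runs after the floor is applied, the raised value can no longer exceed the per-writeback max ratio.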
@@ -969,6 +968,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+ domain_dirty_avail(&gdtc, true);
return __wb_calc_thresh(&gdtc, thresh);
}
@@ -1145,12 +1145,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
long long wb_pos_ratio;
- if (dtc->wb_dirty < 8) {
- dtc->pos_ratio = min_t(long long, pos_ratio * 2,
- 2 << RATELIMIT_CALC_SHIFT);
- return;
- }
-
if (dtc->wb_dirty >= wb_thresh)
return;
@@ -1222,14 +1216,6 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
if (unlikely(wb_thresh > dtc->thresh))
wb_thresh = dtc->thresh;
/*
- * It's very possible that wb_thresh is close to 0 not because the
- * device is slow, but that it has remained inactive for long time.
- * Honour such devices a reasonable good (hopefully IO efficient)
- * threshold, so that the occasional writes won't be blocked and active
- * writes can rampup the threshold quickly.
- */
- wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
- /*
* scale global setpoint to wb's:
* wb_setpoint = setpoint * wb_thresh / thresh
*/
@@ -1484,17 +1470,10 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
* balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
* Hence, to calculate "step" properly, we have to use wb_dirty as
* "dirty" and wb_setpoint as "setpoint".
- *
- * We rampup dirty_ratelimit forcibly if wb_dirty is low because
- * it's possible that wb_thresh is close to zero due to inactivity
- * of backing device.
*/
if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
dirty = dtc->wb_dirty;
- if (dtc->wb_dirty < 8)
- setpoint = dtc->wb_dirty + 1;
- else
- setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
+ setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
}
if (dirty < setpoint) {
@@ -2319,7 +2298,7 @@ static int page_writeback_cpu_online(unsigned int cpu)
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
-static struct ctl_table vm_page_writeback_sysctls[] = {
+static const struct ctl_table vm_page_writeback_sysctls[] = {
{
.procname = "dirty_background_ratio",
.data = &dirty_background_ratio,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 01eab25edf89..542d25f77be8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1295,12 +1295,6 @@ void __meminit __free_pages_core(struct page *page, unsigned int order,
set_page_count(p, 0);
}
- /*
- * Freeing the page with debug_pagealloc enabled will try to
- * unmap it; some archs don't like double-unmappings, so
- * map it first.
- */
- debug_pagealloc_map_pages(page, nr_pages);
adjust_managed_page_count(page, nr_pages);
} else {
for (loop = 0; loop < nr_pages; loop++, p++) {
@@ -1508,7 +1502,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
int i;
set_page_private(page, 0);
- set_page_refcounted(page);
arch_alloc_page(page, order);
debug_pagealloc_map_pages(page, 1 << order);
@@ -1856,6 +1849,14 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
if (order >= pageblock_order)
return true;
+ /*
+	 * Movable pages won't cause permanent fragmentation, so a small movable
+	 * allocation only needs to temporarily steal unmovable or reclaimable
+	 * pages closest to the requested size. After a while, memory compaction
+	 * may form large contiguous pages again, so the next movable allocation
+	 * may not need to steal. Unmovable and reclaimable allocations, by
+	 * contrast, do need to actually steal pages.
+ */
if (order >= pageblock_order / 2 ||
start_mt == MIGRATE_RECLAIMABLE ||
start_mt == MIGRATE_UNMOVABLE ||
@@ -2592,9 +2593,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
return high;
}
-static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
- struct page *page, int migratetype,
- unsigned int order)
+static void free_frozen_page_commit(struct zone *zone,
+ struct per_cpu_pages *pcp, struct page *page, int migratetype,
+ unsigned int order)
{
int high, batch;
int pindex;
@@ -2643,7 +2644,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
/*
* Free a pcp page
*/
-void free_unref_page(struct page *page, unsigned int order)
+void free_frozen_pages(struct page *page, unsigned int order)
{
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp;
@@ -2666,20 +2667,20 @@ void free_unref_page(struct page *page, unsigned int order)
* get those areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
+ zone = page_zone(page);
migratetype = get_pfnblock_migratetype(page, pfn);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, FPI_NONE);
return;
}
migratetype = MIGRATE_MOVABLE;
}
- zone = page_zone(page);
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
- free_unref_page_commit(zone, pcp, page, migratetype, order);
+ free_frozen_page_commit(zone, pcp, page, migratetype, order);
pcp_spin_unlock(pcp);
} else {
free_one_page(zone, page, pfn, order, FPI_NONE);
@@ -2743,7 +2744,7 @@ void free_unref_folios(struct folio_batch *folios)
/*
* Free isolated pages directly to the
- * allocator, see comment in free_unref_page.
+ * allocator, see comment in free_frozen_pages.
*/
if (is_migrate_isolate(migratetype)) {
free_one_page(zone, &folio->page, pfn,
@@ -2774,7 +2775,7 @@ void free_unref_folios(struct folio_batch *folios)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(&folio->page);
- free_unref_page_commit(zone, pcp, &folio->page, migratetype,
+ free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
order);
}
@@ -3567,7 +3568,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
if (!page)
page = get_page_from_freelist(gfp_mask, order,
alloc_flags, ac);
-
return page;
}
@@ -4243,6 +4243,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
restart:
compaction_retries = 0;
no_progress_loops = 0;
+ compact_result = COMPACT_SKIPPED;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist_iter_cookie = zonelist_iter_begin();
@@ -4531,28 +4532,23 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
}
/*
- * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
* @gfp: GFP flags for the allocation
* @preferred_nid: The preferred NUMA node ID to allocate from
* @nodemask: Set of nodes to allocate from, may be NULL
- * @nr_pages: The number of pages desired on the list or array
- * @page_list: Optional list to store the allocated pages
- * @page_array: Optional array to store the pages
+ * @nr_pages: The number of pages desired in the array
+ * @page_array: Array to store the pages
*
* This is a batched version of the page allocator that attempts to
- * allocate nr_pages quickly. Pages are added to page_list if page_list
- * is not NULL, otherwise it is assumed that the page_array is valid.
- *
- * For lists, nr_pages is the number of pages that should be allocated.
+ * allocate nr_pages quickly. Pages are added to the page_array.
*
- * For arrays, only NULL elements are populated with pages and nr_pages
+ * Note that only NULL elements are populated with pages and nr_pages
* is the maximum number of pages that will be stored in the array.
*
- * Returns the number of pages on the list or array.
+ * Returns the number of pages in the array.
*/
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
- struct list_head *page_list,
struct page **page_array)
{
struct page *page;
@@ -4570,7 +4566,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
* Skip populated array elements to determine if any pages need
* to be allocated before disabling IRQs.
*/
- while (page_array && nr_populated < nr_pages && page_array[nr_populated])
+ while (nr_populated < nr_pages && page_array[nr_populated])
nr_populated++;
/* No pages requested? */
@@ -4578,7 +4574,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto out;
/* Already populated array? */
- if (unlikely(page_array && nr_pages - nr_populated == 0))
+ if (unlikely(nr_pages - nr_populated == 0))
goto out;
/* Bulk allocator does not support memcg accounting. */
@@ -4660,7 +4656,7 @@ retry_this_zone:
while (nr_populated < nr_pages) {
/* Skip existing pages */
- if (page_array && page_array[nr_populated]) {
+ if (page_array[nr_populated]) {
nr_populated++;
continue;
}
@@ -4678,11 +4674,8 @@ retry_this_zone:
nr_account++;
prep_new_page(page, 0, gfp, 0);
- if (page_list)
- list_add(&page->lru, page_list);
- else
- page_array[nr_populated] = page;
- nr_populated++;
+ set_page_refcounted(page);
+ page_array[nr_populated++] = page;
}
pcp_spin_unlock(pcp);
@@ -4699,14 +4692,8 @@ failed_irq:
failed:
page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
- if (page) {
- if (page_list)
- list_add(&page->lru, page_list);
- else
- page_array[nr_populated] = page;
- nr_populated++;
- }
-
+ if (page)
+ page_array[nr_populated++] = page;
goto out;
}
EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
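
With the list variant gone, callers hand in a caller-sized array and only the NULL slots are filled. A hedged usage sketch follows; example_bulk_alloc() is illustrative, and in-tree callers normally go through the alloc_hooks() wrapper macros rather than calling the _noprof symbol directly.

/* Hedged sketch: bulk-allocate up to 'want' order-0 pages into an array. */
static unsigned long example_bulk_alloc(struct page **pages, int want)
{
	/* Only NULL elements are populated; pre-filled slots are skipped. */
	return alloc_pages_bulk_noprof(GFP_KERNEL, numa_node_id(), NULL,
				       want, pages);
}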
@@ -4714,8 +4701,8 @@ EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
- int preferred_nid, nodemask_t *nodemask)
+struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
+ int preferred_nid, nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
@@ -4768,7 +4755,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
out:
if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page &&
unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
- __free_pages(page, order);
+ free_frozen_pages(page, order);
page = NULL;
}
@@ -4777,6 +4764,18 @@ out:
return page;
}
+EXPORT_SYMBOL(__alloc_frozen_pages_noprof);
+
+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
+ int preferred_nid, nodemask_t *nodemask)
+{
+ struct page *page;
+
+ page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask);
+ if (page)
+ set_page_refcounted(page);
+ return page;
+}
EXPORT_SYMBOL(__alloc_pages_noprof);
struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
@@ -4837,11 +4836,11 @@ void __free_pages(struct page *page, unsigned int order)
struct alloc_tag *tag = pgalloc_tag_get(page);
if (put_page_testzero(page))
- free_unref_page(page, order);
+ free_frozen_pages(page, order);
else if (!head) {
pgalloc_tag_sub_pages(tag, (1 << order) - 1);
while (order-- > 0)
- free_unref_page(page + (1 << order), order);
+ free_frozen_pages(page + (1 << order), order);
}
}
EXPORT_SYMBOL(__free_pages);
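
The split above yields a "frozen" entry point whose pages come back with a zero refcount; __alloc_pages_noprof() is now just that plus set_page_refcounted(), and __free_pages() above hands the final reference to free_frozen_pages(). A hedged sketch of how the frozen pair is meant to be used by MM-internal callers that manage page lifetime themselves (the roundtrip function below is illustrative, not from this patch):

/*
 * Hedged sketch: a frozen page carries no reference, so it is returned
 * with free_frozen_pages() rather than put_page()/__free_pages().
 */
static void example_frozen_page_roundtrip(void)
{
	struct page *page;

	page = __alloc_frozen_pages_noprof(GFP_KERNEL, 0, numa_node_id(), NULL);
	if (!page)
		return;

	/* ... use the page without ever elevating its refcount ... */

	free_frozen_pages(page, 0);
}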
@@ -5161,13 +5160,6 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
zonerefs->zone_idx = 0;
}
-/*
- * Build zonelists ordered by zone and nodes within zones.
- * This results in conserving DMA zone[s] until all Normal memory is
- * exhausted, but results in overflowing to remote node while memory
- * may still exist in local DMA zone.
- */
-
static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
@@ -5858,11 +5850,10 @@ static void setup_per_zone_lowmem_reserve(void)
for (j = i + 1; j < MAX_NR_ZONES; j++) {
struct zone *upper_zone = &pgdat->node_zones[j];
- bool empty = !zone_managed_pages(upper_zone);
managed_pages += zone_managed_pages(upper_zone);
- if (clear || empty)
+ if (clear)
zone->lowmem_reserve[j] = 0;
else
zone->lowmem_reserve[j] = managed_pages / ratio;
@@ -6175,7 +6166,7 @@ out:
return ret;
}
-static struct ctl_table page_alloc_sysctl_table[] = {
+static const struct ctl_table page_alloc_sysctl_table[] = {
{
.procname = "min_free_kbytes",
.data = &min_free_kbytes,
@@ -6270,9 +6261,8 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
* @migratetype: using migratetype to filter the type of migration in
* trace_mm_alloc_contig_migrate_range_info.
*/
-int __alloc_contig_migrate_range(struct compact_control *cc,
- unsigned long start, unsigned long end,
- int migratetype)
+static int __alloc_contig_migrate_range(struct compact_control *cc,
+ unsigned long start, unsigned long end, int migratetype)
{
/* This function is based on compact_zone() from compaction.c. */
unsigned int nr_reclaimed;
@@ -6281,7 +6271,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
int ret = 0;
struct migration_target_control mtc = {
.nid = zone_to_nid(cc->zone),
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ .gfp_mask = cc->gfp_mask,
.reason = MR_CONTIG_RANGE,
};
struct page *page;
@@ -6351,7 +6341,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
return (ret < 0) ? ret : 0;
}
-static void split_free_pages(struct list_head *list)
+static void split_free_pages(struct list_head *list, gfp_t gfp_mask)
{
int order;
@@ -6362,7 +6352,8 @@ static void split_free_pages(struct list_head *list)
list_for_each_entry_safe(page, next, &list[order], lru) {
int i;
- post_alloc_hook(page, order, __GFP_MOVABLE);
+ post_alloc_hook(page, order, gfp_mask);
+ set_page_refcounted(page);
if (!order)
continue;
@@ -6376,6 +6367,40 @@ static void split_free_pages(struct list_head *list)
}
}
+static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
+{
+ const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+ const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
+ __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO;
+ const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
+
+ /*
+ * We are given the range to allocate; node, mobility and placement
+ * hints are irrelevant at this point. We'll simply ignore them.
+ */
+ gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE |
+ __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE);
+
+ /*
+ * We only support most reclaim flags (but not NOFAIL/NORETRY), and
+ * selected action flags.
+ */
+ if (gfp_mask & ~(reclaim_mask | action_mask))
+ return -EINVAL;
+
+ /*
+ * Flags to control page compaction/migration/reclaim, to free up our
+ * page range. Migratable pages are movable, __GFP_MOVABLE is implied
+ * for them.
+ *
+ * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that
+ * to not degrade callers.
+ */
+ *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) |
+ __GFP_MOVABLE | __GFP_RETRY_MAYFAIL;
+ return 0;
+}
+
/**
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
@@ -6384,7 +6409,9 @@ static void split_free_pages(struct list_head *list)
* #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
* in range must have the same migratetype and it must
* be either of the two.
- * @gfp_mask: GFP mask to use during compaction
+ * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some
+ * action and reclaim modifiers are supported. Reclaim modifiers
+ * control allocation behavior during compaction/migration/reclaim.
*
* The PFN range does not have to be pageblock aligned. The PFN range must
* belong to a single zone.
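
Given the gfp_mask policing added in __alloc_contig_verify_gfp_mask(), a caller passes only reclaim and action modifiers; placement hints would be stripped anyway. A hedged usage sketch, assuming the target range's pageblocks are MIGRATE_MOVABLE and using the usual alloc_contig_range()/free_contig_range() wrappers (example_claim_range() itself is illustrative):

/* Hedged sketch: claim and later release a physically contiguous PFN range. */
static int example_claim_range(unsigned long start_pfn, unsigned long nr_pages)
{
	/* Reclaim/action modifiers only; node/zone/mobility hints are ignored. */
	int ret = alloc_contig_range(start_pfn, start_pfn + nr_pages,
				     MIGRATE_MOVABLE,
				     GFP_KERNEL | __GFP_NOWARN);
	if (ret)
		return ret;

	/* ... use pfn_to_page(start_pfn) .. pfn_to_page(start_pfn + nr_pages - 1) ... */

	free_contig_range(start_pfn, nr_pages);
	return 0;
}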
@@ -6410,11 +6437,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.no_set_skip_hint = true,
- .gfp_mask = current_gfp_context(gfp_mask),
.alloc_contig = true,
};
INIT_LIST_HEAD(&cc.migratepages);
+ gfp_mask = current_gfp_context(gfp_mask);
+ if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
+ return -EINVAL;
+
/*
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because pageblock and max order pages may
@@ -6436,7 +6466,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
* put back to page allocator so that buddy can use them.
*/
- ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
+ ret = start_isolate_page_range(start, end, migratetype, 0);
if (ret)
goto done;
@@ -6455,7 +6485,17 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
ret = __alloc_contig_migrate_range(&cc, start, end, migratetype);
if (ret && ret != -EBUSY)
goto done;
- ret = 0;
+
+ /*
+ * When in-use hugetlb pages are migrated, they may simply be released
+ * back into the free hugepage pool instead of being returned to the
+ * buddy system. After the migration of in-use huge pages is completed,
+ * we will invoke replace_free_hugepage_folios() to ensure that these
+ * hugepages are properly released to the buddy system.
+ */
+ ret = replace_free_hugepage_folios(start, end);
+ if (ret)
+ goto done;
/*
* Pages from [start, end) are within a pageblock_nr_pages
@@ -6489,7 +6529,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
}
if (!(gfp_mask & __GFP_COMP)) {
- split_free_pages(cc.freepages);
+ split_free_pages(cc.freepages, gfp_mask);
/* Free head and tail (if any) */
if (start != outer_start)
@@ -6502,6 +6542,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
check_new_pages(head, order);
prep_new_page(head, order, gfp_mask, 0);
+ set_page_refcounted(head);
} else {
ret = -EINVAL;
WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
@@ -6556,7 +6597,9 @@ static bool zone_spans_last_pfn(const struct zone *zone,
/**
* alloc_contig_pages() -- tries to find and allocate contiguous range of pages
* @nr_pages: Number of contiguous pages to allocate
- * @gfp_mask: GFP mask to limit search and used during compaction
+ * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some
+ * action and reclaim modifiers are supported. Reclaim modifiers
+ * control allocation behavior during compaction/migration/reclaim.
* @nid: Target node
* @nodemask: Mask for other possible nodes
*
@@ -6961,7 +7004,7 @@ static inline bool has_unaccepted_memory(void)
static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
- long to_accept;
+ long to_accept, wmark;
bool ret = false;
if (!has_unaccepted_memory())
@@ -6970,8 +7013,18 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order)
if (list_empty(&zone->unaccepted_pages))
return false;
+ wmark = promo_wmark_pages(zone);
+
+ /*
+ * Watermarks have not been initialized yet.
+ *
+ * Accepting one MAX_ORDER page to ensure progress.
+ */
+ if (!wmark)
+ return try_to_accept_memory_one(zone);
+
/* How much to accept to get to promo watermark? */
- to_accept = promo_wmark_pages(zone) -
+ to_accept = wmark -
(zone_page_state(zone, NR_FREE_PAGES) -
__zone_watermark_unusable_free(zone, order, 0) -
zone_page_state(zone, NR_UNACCEPTED));
diff --git a/mm/page_counter.c b/mm/page_counter.c
index b249d15af9dd..af23f927611b 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -288,7 +288,7 @@ int page_counter_memparse(const char *buf, const char *max,
}
-#ifdef CONFIG_MEMCG
+#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
/*
* This function calculates an individual page counter's effective
* protection which is derived from its own memory.min/low, its
@@ -460,4 +460,4 @@ void page_counter_calculate_protection(struct page_counter *root,
atomic_long_read(&parent->children_low_usage),
recursive_protection));
}
-#endif /* CONFIG_MEMCG */
+#endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index 3f7a203d35c6..d2423f30577e 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -86,7 +86,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
if (page_ref_sub_and_test(page, count))
- free_unref_page(page, compound_order(page));
+ free_frozen_pages(page, compound_order(page));
}
EXPORT_SYMBOL(__page_frag_cache_drain);
@@ -138,7 +138,7 @@ refill:
goto refill;
if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) {
- free_unref_page(page,
+ free_frozen_pages(page,
encoded_page_decode_order(encoded_page));
goto refill;
}
@@ -166,6 +166,6 @@ void page_frag_free(void *addr)
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
- free_unref_page(page, compound_order(page));
+ free_frozen_pages(page, compound_order(page));
}
EXPORT_SYMBOL(page_frag_free);
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 41ea77f22011..947c7c7a3728 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -112,7 +112,7 @@ static void page_idle_clear_pte_refs(struct folio *folio)
}
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *attr, char *buf,
+ const struct bin_attribute *attr, char *buf,
loff_t pos, size_t count)
{
u64 *out = (u64 *)buf;
@@ -157,7 +157,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
}
static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
- struct bin_attribute *attr, char *buf,
+ const struct bin_attribute *attr, char *buf,
loff_t pos, size_t count)
{
const u64 *in = (u64 *)buf;
@@ -193,17 +193,17 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
return (char *)in - buf;
}
-static struct bin_attribute page_idle_bitmap_attr =
+static const struct bin_attribute page_idle_bitmap_attr =
__BIN_ATTR(bitmap, 0600,
page_idle_bitmap_read, page_idle_bitmap_write, 0);
-static struct bin_attribute *page_idle_bin_attrs[] = {
+static const struct bin_attribute *const page_idle_bin_attrs[] = {
&page_idle_bitmap_attr,
NULL,
};
static const struct attribute_group page_idle_attr_group = {
- .bin_attrs = page_idle_bin_attrs,
+ .bin_attrs_new = page_idle_bin_attrs,
.name = "page_idle",
};
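
The bitmap these attributes expose is read from userspace at /sys/kernel/mm/page_idle/bitmap as an array of u64 words, one bit per page frame: byte offset N corresponds to a starting PFN of N * 8 (so word K covers PFNs K*64 .. K*64+63), and accesses must be 8-byte aligned. A hedged userspace sketch of testing the idle bit of one PFN (requires root; pfn_is_idle() is illustrative):

/* Hedged userspace sketch: test the idle bit of one PFN. */
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

static int pfn_is_idle(unsigned long pfn)
{
	uint64_t word;
	int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDONLY);

	if (fd < 0)
		return -1;
	/* Word N covers PFNs N*64 .. N*64+63; reads must be 8-byte aligned. */
	if (pread(fd, &word, sizeof(word), (pfn / 64) * sizeof(word)) !=
	    sizeof(word)) {
		close(fd);
		return -1;
	}
	close(fd);
	return (word >> (pfn % 64)) & 1;
}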
diff --git a/mm/page_io.c b/mm/page_io.c
index 4b4ea8e49cf6..9b983de351f9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -163,7 +163,6 @@ reprobe:
page_no = 1; /* force Empty message */
sis->max = page_no;
sis->pages = page_no - 1;
- sis->highest_bit = page_no - 1;
out:
return ret;
bad_bmap:
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 7e04047977cf..a051a29e95ad 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -286,7 +286,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* within a free or in-use page.
* @boundary_pfn: pageblock-aligned pfn that a page might cross
* @flags: isolation flags
- * @gfp_flags: GFP flags used for migrating pages
* @isolate_before: isolate the pageblock before the boundary_pfn
* @skip_isolation: the flag to skip the pageblock isolation in second
* isolate_single_pageblock()
@@ -306,8 +305,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* the in-use page then splitting the free page.
*/
static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
- gfp_t gfp_flags, bool isolate_before, bool skip_isolation,
- int migratetype)
+ bool isolate_before, bool skip_isolation, int migratetype)
{
unsigned long start_pfn;
unsigned long isolate_pageblock;
@@ -444,8 +442,6 @@ failed:
* and PageOffline() pages.
* REPORT_FAILURE - report details about the failure to
* isolate the range
- * @gfp_flags: GFP flags used for migrating pages that sit across the
- * range boundaries.
*
* Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
* the range will never be allocated. Any free pages and pages freed in the
@@ -478,7 +474,7 @@ failed:
* Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- int migratetype, int flags, gfp_t gfp_flags)
+ int migratetype, int flags)
{
unsigned long pfn;
struct page *page;
@@ -489,7 +485,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
bool skip_isolation = false;
/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
- ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false,
+ ret = isolate_single_pageblock(isolate_start, flags, false,
skip_isolation, migratetype);
if (ret)
return ret;
@@ -498,7 +494,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
skip_isolation = true;
/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
- ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true,
+ ret = isolate_single_pageblock(isolate_end, flags, true,
skip_isolation, migratetype);
if (ret) {
unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
@@ -612,6 +608,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
int ret;
/*
+	 * Due to the deferred freeing of hugetlb folios, the hugepage folios may
+	 * not be released to the buddy system immediately. This can cause PageBuddy()
+ * to fail in __test_page_isolated_in_pageblock(). To ensure that the
+ * hugetlb folios are properly released back to the buddy system, we
+ * invoke the wait_for_freed_hugetlb_folios() function to wait for the
+ * release to complete.
+ */
+ wait_for_freed_hugetlb_folios();
+
+ /*
* Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free
* pages are not aligned to pageblock_nr_pages.
* Then we just check migratetype first.
diff --git a/mm/percpu.c b/mm/percpu.c
index d8dd31a2e407..7b5835356d1e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1359,10 +1359,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
/* allocate chunk */
alloc_size = struct_size(chunk, populated,
BITS_TO_LONGS(region_size >> PAGE_SHIFT));
- chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!chunk)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
INIT_LIST_HEAD(&chunk->list);
@@ -1374,24 +1371,14 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
region_bits = pcpu_chunk_map_bits(chunk);
alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
- chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!chunk->alloc_map)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size =
BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
- chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!chunk->bound_map)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
- chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!chunk->md_blocks)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
-
+ chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
#ifdef NEED_PCPUOBJ_EXT
/* first chunk is free to use */
chunk->obj_exts = NULL;
@@ -2595,28 +2582,16 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* process group information and build config tables accordingly */
alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
- group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!group_offsets)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
- group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!group_sizes)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
- unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!unit_map)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
- unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
- if (!unit_off)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- alloc_size);
+ unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
@@ -2685,12 +2660,9 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_free_slot = pcpu_sidelined_slot + 1;
pcpu_to_depopulate_slot = pcpu_free_slot + 1;
pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
- pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
+ pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots *
sizeof(pcpu_chunk_lists[0]),
SMP_CACHE_BYTES);
- if (!pcpu_chunk_lists)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
@@ -3099,7 +3071,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
continue;
}
/* copy and return the unused part */
- memcpy(ptr, __per_cpu_load, ai->static_size);
+ memcpy(ptr, __per_cpu_start, ai->static_size);
pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
}
}
@@ -3155,25 +3127,19 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
pmd_t *pmd;
if (pgd_none(*pgd)) {
- p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
- if (!p4d)
- goto err_alloc;
+ p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
pgd_populate(&init_mm, pgd, p4d);
}
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
- pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
- if (!pud)
- goto err_alloc;
+ pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
p4d_populate(&init_mm, p4d, pud);
}
pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
- pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
- if (!pmd)
- goto err_alloc;
+ pmd = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
pud_populate(&init_mm, pud, pmd);
}
@@ -3181,16 +3147,11 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
if (!pmd_present(*pmd)) {
pte_t *new;
- new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
- if (!new)
- goto err_alloc;
+ new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
pmd_populate_kernel(&init_mm, pmd, new);
}
return;
-
-err_alloc:
- panic("%s: Failed to allocate memory\n", __func__);
}
/**
@@ -3237,10 +3198,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
- pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
- if (!pages)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- pages_size);
+ pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES);
/* allocate pages */
j = 0;
@@ -3282,7 +3240,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t
flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);
/* copy static data */
- memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
+ memcpy((void *)unit_addr, __per_cpu_start, ai->static_size);
}
/* we're ready, commit */
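(Note, not part of the patch: the memblock_alloc_or_panic() conversions in mm/percpu.c above all assume the helper folds the removed open-coded allocate-then-panic pattern into a single call. A minimal sketch of the assumed semantics, for illustration only:

	static inline void *memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align)
	{
		void *ptr = memblock_alloc(size, align);

		/* Same behaviour as the removed open-coded checks above. */
		if (!ptr)
			panic("%s: Failed to allocate %pa bytes\n", __func__, &size);
		return ptr;
	}
)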
diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c
new file mode 100644
index 000000000000..7e9455a18aae
--- /dev/null
+++ b/mm/pt_reclaim.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/hugetlb.h>
+#include <asm-generic/tlb.h>
+#include <asm/pgalloc.h>
+
+#include "internal.h"
+
+bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
+ struct zap_details *details)
+{
+ return details && details->reclaim_pt && (end - start >= PMD_SIZE);
+}
+
+bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval)
+{
+ spinlock_t *pml = pmd_lockptr(mm, pmd);
+
+ if (!spin_trylock(pml))
+ return false;
+
+ *pmdval = pmdp_get_lockless(pmd);
+ pmd_clear(pmd);
+ spin_unlock(pml);
+
+ return true;
+}
+
+void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
+ pmd_t pmdval)
+{
+ pte_free_tlb(tlb, pmd_pgtable(pmdval), addr);
+ mm_dec_nr_ptes(mm);
+}
+
+void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
+ struct mmu_gather *tlb)
+{
+ pmd_t pmdval;
+ spinlock_t *pml, *ptl = NULL;
+ pte_t *start_pte, *pte;
+ int i;
+
+ pml = pmd_lock(mm, pmd);
+ start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
+ if (!start_pte)
+ goto out_ptl;
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+ /* Check if it is an empty PTE page */
+ for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) {
+ if (!pte_none(ptep_get(pte)))
+ goto out_ptl;
+ }
+ pte_unmap(start_pte);
+
+ pmd_clear(pmd);
+
+ if (ptl != pml)
+ spin_unlock(ptl);
+ spin_unlock(pml);
+
+ free_pte(mm, addr, tlb, pmdval);
+
+ return;
+out_ptl:
+ if (start_pte)
+ pte_unmap_unlock(start_pte, ptl);
+ if (ptl != pml)
+ spin_unlock(pml);
+}
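(For context, a rough sketch of how a zap path might use the helpers added above. Illustrative only: the function and variable names below are hypothetical, and the real caller lives in the zap/madvise code rather than in this file.

	static void zap_and_maybe_reclaim_pt(struct mm_struct *mm, struct mmu_gather *tlb,
					     pmd_t *pmd, unsigned long addr, unsigned long end,
					     struct zap_details *details)
	{
		/* ... zap the PTE range covered by this PMD as usual ... */

		/*
		 * Only attempt PTE-table reclaim when the caller opted in via
		 * details->reclaim_pt and the range spans at least a full PMD.
		 */
		if (reclaim_pt_is_enabled(addr, end, details))
			try_to_free_pte(mm, pmd, addr, tlb);
	}
)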
diff --git a/mm/readahead.c b/mm/readahead.c
index e151f4b13ca4..6a4e96b69702 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,20 +158,10 @@ static void read_pages(struct readahead_control *rac)
if (aops->readahead) {
aops->readahead(rac);
- /*
- * Clean up the remaining folios. The sizes in ->ra
- * may be used to size the next readahead, so make sure
- * they accurately reflect what happened.
- */
+ /* Clean up the remaining folios. */
while ((folio = readahead_folio(rac)) != NULL) {
- unsigned long nr = folio_nr_pages(folio);
-
folio_get(folio);
- rac->ra->size -= nr;
- if (rac->ra->async_size >= nr) {
- rac->ra->async_size -= nr;
- filemap_remove_folio(folio);
- }
+ filemap_remove_folio(folio);
folio_unlock(folio);
folio_put(folio);
}
@@ -188,6 +178,18 @@ static void read_pages(struct readahead_control *rac)
BUG_ON(readahead_count(rac));
}
+static struct folio *ractl_alloc_folio(struct readahead_control *ractl,
+ gfp_t gfp_mask, unsigned int order)
+{
+ struct folio *folio;
+
+ folio = filemap_alloc_folio(gfp_mask, order);
+ if (folio && ractl->dropbehind)
+ __folio_set_dropbehind(folio);
+
+ return folio;
+}
+
/**
* page_cache_ra_unbounded - Start unchecked readahead.
* @ractl: Readahead control.
@@ -265,8 +267,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
continue;
}
- folio = filemap_alloc_folio(gfp_mask,
- mapping_min_folio_order(mapping));
+ folio = ractl_alloc_folio(ractl, gfp_mask,
+ mapping_min_folio_order(mapping));
if (!folio)
break;
@@ -436,7 +438,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
pgoff_t mark, unsigned int order, gfp_t gfp)
{
int err;
- struct folio *folio = filemap_alloc_folio(gfp, order);
+ struct folio *folio = ractl_alloc_folio(ractl, gfp, order);
if (!folio)
return -ENOMEM;
@@ -458,7 +460,8 @@ void page_cache_ra_order(struct readahead_control *ractl,
struct file_ra_state *ra, unsigned int new_order)
{
struct address_space *mapping = ractl->mapping;
- pgoff_t index = readahead_index(ractl);
+ pgoff_t start = readahead_index(ractl);
+ pgoff_t index = start;
unsigned int min_order = mapping_min_folio_order(mapping);
pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
pgoff_t mark = index + ra->size - ra->async_size;
@@ -516,12 +519,18 @@ void page_cache_ra_order(struct readahead_control *ractl,
/*
* If there were already pages in the page cache, then we may have
* left some gaps. Let the regular readahead code take care of this
- * situation.
+ * situation below.
*/
if (!err)
return;
fallback:
- do_page_cache_ra(ractl, ra->size, ra->async_size);
+ /*
+ * ->readahead() may have updated the readahead window size, so we
+ * have to check that there's still something to read.
+ */
+ if (ra->size > index - start)
+ do_page_cache_ra(ractl, ra->size - (index - start),
+ ra->async_size);
}
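(A quick worked example of the adjusted fallback above, assuming order-0 pages: if ra->size is 32 and the large-folio loop already inserted folios covering 20 pages, so index - start == 20, the fallback now issues do_page_cache_ra() for only the remaining 12 pages instead of re-requesting all 32; if ra->size has meanwhile been reduced to 20 or fewer, nothing further is read.)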
static unsigned long ractl_max_pages(struct readahead_control *ractl,
@@ -754,7 +763,7 @@ void readahead_expand(struct readahead_control *ractl,
if (folio && !xa_is_value(folio))
return; /* Folio apparently present */
- folio = filemap_alloc_folio(gfp_mask, min_order);
+ folio = ractl_alloc_folio(ractl, gfp_mask, min_order);
if (!folio)
return;
@@ -783,7 +792,7 @@ void readahead_expand(struct readahead_control *ractl,
if (folio && !xa_is_value(folio))
return; /* Folio apparently present */
- folio = filemap_alloc_folio(gfp_mask, min_order);
+ folio = ractl_alloc_folio(ractl, gfp_mask, min_order);
if (!folio)
return;
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 6d783436951f..e7173fcd210c 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -12,7 +12,8 @@
#include <linux/mm.h>
#include <asm/sections.h>
-static const int rodata_test_data = 0xC3;
+#define TEST_VALUE 0xC3
+static const int rodata_test_data = TEST_VALUE;
void rodata_test(void)
{
@@ -20,7 +21,7 @@ void rodata_test(void)
/* test 1: read the value */
/* If this test fails, some previous testrun has clobbered the state */
- if (!rodata_test_data) {
+ if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) {
pr_err("test 1 fails (start data)\n");
return;
}
@@ -33,7 +34,7 @@ void rodata_test(void)
}
/* test 3: check the value hasn't changed */
- if (rodata_test_data == zero) {
+ if (unlikely(READ_ONCE(rodata_test_data) != TEST_VALUE)) {
pr_err("test data was changed\n");
return;
}
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 399552814fd0..1b0a214ee558 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -195,14 +195,13 @@ static struct file *secretmem_file_create(unsigned long flags)
struct file *file;
struct inode *inode;
const char *anon_name = "[secretmem]";
- const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
int err;
inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
if (IS_ERR(inode))
return ERR_CAST(inode);
- err = security_inode_init_security_anon(inode, &qname, NULL);
+ err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL);
if (err) {
file = ERR_PTR(err);
goto err_free_inode;
diff --git a/mm/shmem.c b/mm/shmem.c
index 532afd8e049c..ab61c8bb20e1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -553,38 +553,105 @@ static bool shmem_confirm_swap(struct address_space *mapping,
/* ifdef here to avoid bloating shmem.o when not necessary */
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
+static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER;
-static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- unsigned long vm_flags)
+/**
+ * shmem_mapping_size_orders - Get allowable folio orders for the given file size.
+ * @mapping: Target address_space.
+ * @index: The page index.
+ * @write_end: end of a write, could extend inode size.
+ *
+ * This returns huge orders for folios (when supported) based on the file size
+ * which the mapping currently allows at the given index. The index is relevant
+ * due to alignment considerations the mapping might have. The returned order
+ * may be less than the size passed.
+ *
+ * Return: The orders.
+ */
+static inline unsigned int
+shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end)
+{
+ unsigned int order;
+ size_t size;
+
+ if (!mapping_large_folio_support(mapping) || !write_end)
+ return 0;
+
+ /* Calculate the write size based on the write_end */
+ size = write_end - (index << PAGE_SHIFT);
+ order = filemap_get_order(size);
+ if (!order)
+ return 0;
+
+ /* If we're not aligned, allocate a smaller folio */
+ if (index & ((1UL << order) - 1))
+ order = __ffs(index);
+
+ order = min_t(size_t, order, MAX_PAGECACHE_ORDER);
+ return order > 0 ? BIT(order + 1) - 1 : 0;
+}
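(A brief worked example of the calculation above, assuming 4KiB pages and MAX_PAGECACHE_ORDER >= 8: a write whose write_end lies 1MiB past index << PAGE_SHIFT gives size = 1MiB and roughly order 8 from filemap_get_order(); if index is aligned to 256 pages, the function returns BIT(9) - 1, a mask allowing all folio orders 0-8, while an index aligned only to 4 pages is capped at __ffs(index) = 2, allowing orders 0-2.)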
+
+static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ struct vm_area_struct *vma,
+ unsigned long vm_flags)
{
+ unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
+ 0 : BIT(HPAGE_PMD_ORDER);
+ unsigned long within_size_orders;
+ unsigned int order;
+ pgoff_t aligned_index;
loff_t i_size;
- if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
- return false;
if (!S_ISREG(inode->i_mode))
- return false;
+ return 0;
if (shmem_huge == SHMEM_HUGE_DENY)
- return false;
+ return 0;
if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
- return true;
+ return maybe_pmd_order;
+ /*
+ * The huge order allocation for anon shmem is controlled through
+ * the mTHP interface, so we still use PMD-sized huge order to
+ * check whether global control is enabled.
+ *
+ * For tmpfs mmap()'s huge order, we still use PMD-sized order to
+ * allocate huge pages due to lack of a write size hint.
+ *
+	 * Otherwise, tmpfs derives the highest allowable order hint from the
+	 * write size on the write and fallocate paths, and then tries each
+	 * allowable huge order.
+ */
switch (SHMEM_SB(inode->i_sb)->huge) {
case SHMEM_HUGE_ALWAYS:
- return true;
+ if (vma)
+ return maybe_pmd_order;
+
+ return shmem_mapping_size_orders(inode->i_mapping, index, write_end);
case SHMEM_HUGE_WITHIN_SIZE:
- index = round_up(index + 1, HPAGE_PMD_NR);
- i_size = max(write_end, i_size_read(inode));
- i_size = round_up(i_size, PAGE_SIZE);
- if (i_size >> PAGE_SHIFT >= index)
- return true;
+ if (vma)
+ within_size_orders = maybe_pmd_order;
+ else
+ within_size_orders = shmem_mapping_size_orders(inode->i_mapping,
+ index, write_end);
+
+ order = highest_order(within_size_orders);
+ while (within_size_orders) {
+ aligned_index = round_up(index + 1, 1 << order);
+ i_size = max(write_end, i_size_read(inode));
+ i_size = round_up(i_size, PAGE_SIZE);
+ if (i_size >> PAGE_SHIFT >= aligned_index)
+ return within_size_orders;
+
+ order = next_order(&within_size_orders, order);
+ }
fallthrough;
case SHMEM_HUGE_ADVISE:
if (vm_flags & VM_HUGEPAGE)
- return true;
+ return maybe_pmd_order;
fallthrough;
default:
- return false;
+ return 0;
}
}
@@ -779,11 +846,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
return 0;
}
-static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- unsigned long vm_flags)
+static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ struct vm_area_struct *vma,
+ unsigned long vm_flags)
{
- return false;
+ return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -1180,7 +1248,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
STATX_ATTR_NODUMP);
generic_fillattr(idmap, request_mask, inode, stat);
- if (shmem_huge_global_enabled(inode, 0, 0, false, 0))
+ if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
if (request_mask & STATX_BTIME) {
@@ -1480,7 +1548,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (WARN_ON_ONCE(!wbc->for_reclaim))
goto redirty;
- if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
+ if ((info->flags & VM_LOCKED) || sbinfo->noswap)
goto redirty;
if (!total_swap_pages)
@@ -1690,22 +1758,18 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
unsigned long vm_flags = vma ? vma->vm_flags : 0;
pgoff_t aligned_index;
- bool global_huge;
+ unsigned int global_orders;
loff_t i_size;
int order;
if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
return 0;
- global_huge = shmem_huge_global_enabled(inode, index, write_end,
- shmem_huge_force, vm_flags);
- if (!vma || !vma_is_anon_shmem(vma)) {
- /*
- * For tmpfs, we now only support PMD sized THP if huge page
- * is enabled, otherwise fallback to order 0.
- */
- return global_huge ? BIT(HPAGE_PMD_ORDER) : 0;
- }
+ global_orders = shmem_huge_global_enabled(inode, index, write_end,
+ shmem_huge_force, vma, vm_flags);
+ /* Tmpfs huge pages allocation */
+ if (!vma || !vma_is_anon_shmem(vma))
+ return global_orders;
/*
* Following the 'deny' semantics of the top level, force the huge
@@ -1737,7 +1801,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
if (vm_flags & VM_HUGEPAGE)
mask |= READ_ONCE(huge_shmem_orders_madvise);
- if (global_huge)
+ if (global_orders > 0)
mask |= READ_ONCE(huge_shmem_orders_inherit);
return THP_ORDERS_ALL_FILE_DEFAULT & mask;
@@ -1903,6 +1967,65 @@ unlock:
return ERR_PTR(error);
}
+static struct folio *shmem_swap_alloc_folio(struct inode *inode,
+ struct vm_area_struct *vma, pgoff_t index,
+ swp_entry_t entry, int order, gfp_t gfp)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct folio *new;
+ void *shadow;
+ int nr_pages;
+
+ /*
+ * We have arrived here because our zones are constrained, so don't
+ * limit chance of success with further cpuset and node constraints.
+ */
+ gfp &= ~GFP_CONSTRAINT_MASK;
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) {
+ gfp_t huge_gfp = vma_thp_gfp_mask(vma);
+
+ gfp = limit_gfp_mask(huge_gfp, gfp);
+ }
+
+ new = shmem_alloc_folio(gfp, order, info, index);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ nr_pages = folio_nr_pages(new);
+ if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
+ gfp, entry)) {
+ folio_put(new);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * Prevent parallel swapin from proceeding with the swap cache flag.
+ *
+ * Of course there is another possible concurrent scenario as well,
+ * that is to say, the swap cache flag of a large folio has already
+ * been set by swapcache_prepare(), while another thread may have
+ * already split the large swap entry stored in the shmem mapping.
+ * In this case, shmem_add_to_page_cache() will help identify the
+ * concurrent swapin and return -EEXIST.
+ */
+ if (swapcache_prepare(entry, nr_pages)) {
+ folio_put(new);
+ return ERR_PTR(-EEXIST);
+ }
+
+ __folio_set_locked(new);
+ __folio_set_swapbacked(new);
+ new->swap = entry;
+
+ mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
+ shadow = get_shadow_from_swap_cache(entry);
+ if (shadow)
+ workingset_refault(new, shadow);
+ folio_add_lru(new);
+ swap_read_folio(new, NULL);
+ return new;
+}
+
/*
* When a page is moved from swapcache to shmem filecache (either by the
* usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
@@ -2006,7 +2129,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
}
static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
- struct folio *folio, swp_entry_t swap)
+ struct folio *folio, swp_entry_t swap,
+ bool skip_swapcache)
{
struct address_space *mapping = inode->i_mapping;
swp_entry_t swapin_error;
@@ -2022,7 +2146,8 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
nr_pages = folio_nr_pages(folio);
folio_wait_writeback(folio);
- delete_from_swap_cache(folio);
+ if (!skip_swapcache)
+ delete_from_swap_cache(folio);
/*
* Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
* won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
@@ -2126,8 +2251,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct shmem_inode_info *info = SHMEM_I(inode);
struct swap_info_struct *si;
struct folio *folio = NULL;
+ bool skip_swapcache = false;
swp_entry_t swap;
- int error, nr_pages;
+ int error, nr_pages, order, split_order;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
swap = radix_to_swp_entry(*foliop);
@@ -2146,8 +2272,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
+ order = xa_get_order(&mapping->i_pages, index);
if (!folio) {
- int split_order;
+ bool fallback_order0 = false;
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
@@ -2157,6 +2284,33 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
}
/*
+		 * If uffd is active for the vma, we need per-page fault
+		 * fidelity to maintain the uffd semantics, so fall back
+		 * to swapping in order-0 folios; do the same for the zswap case.
+ */
+ if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
+ !zswap_never_enabled()))
+ fallback_order0 = true;
+
+ /* Skip swapcache for synchronous device. */
+ if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
+ folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
+ if (!IS_ERR(folio)) {
+ skip_swapcache = true;
+ goto alloced;
+ }
+
+ /*
+			 * Fall back to swapping in an order-0 folio, unless the
+			 * swap entry already exists.
+ */
+ error = PTR_ERR(folio);
+ folio = NULL;
+ if (error == -EEXIST)
+ goto failed;
+ }
+
+ /*
* Now swap device can only swap in order 0 folio, then we
* should split the large swap entry stored in the pagecache
* if necessary.
@@ -2184,13 +2338,38 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
error = -ENOMEM;
goto failed;
}
+ } else if (order != folio_order(folio)) {
+ /*
+ * Swap readahead may swap in order 0 folios into swapcache
+		 * asynchronously, while the shmem mapping can still store
+ * large swap entries. In such cases, we should split the
+ * large swap entry to prevent possible data corruption.
+ */
+ split_order = shmem_split_large_entry(inode, index, swap, gfp);
+ if (split_order < 0) {
+ error = split_order;
+ goto failed;
+ }
+
+ /*
+ * If the large swap entry has already been split, it is
+ * necessary to recalculate the new swap entry based on
+ * the old order alignment.
+ */
+ if (split_order > 0) {
+ pgoff_t offset = index - round_down(index, 1 << split_order);
+
+ swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+ }
}
+alloced:
/* We have to do this with folio locked to prevent races */
folio_lock(folio);
- if (!folio_test_swapcache(folio) ||
+ if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
folio->swap.val != swap.val ||
- !shmem_confirm_swap(mapping, index, swap)) {
+ !shmem_confirm_swap(mapping, index, swap) ||
+ xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
error = -EEXIST;
goto unlock;
}
@@ -2224,7 +2403,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
- delete_from_swap_cache(folio);
+ if (skip_swapcache) {
+ folio->swap.val = 0;
+ swapcache_clear(si, swap, nr_pages);
+ } else {
+ delete_from_swap_cache(folio);
+ }
folio_mark_dirty(folio);
swap_free_nr(swap, nr_pages);
put_swap_device(si);
@@ -2235,8 +2419,11 @@ failed:
if (!shmem_confirm_swap(mapping, index, swap))
error = -EEXIST;
if (error == -EIO)
- shmem_set_folio_swapin_error(inode, index, folio, swap);
+ shmem_set_folio_swapin_error(inode, index, folio, swap,
+ skip_swapcache);
unlock:
+ if (skip_swapcache)
+ swapcache_clear(si, swap, folio_nr_pages(folio));
if (folio) {
folio_unlock(folio);
folio_put(folio);
@@ -2752,12 +2939,6 @@ out_nomem:
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
- struct shmem_inode_info *info = SHMEM_I(inode);
- int ret;
-
- ret = seal_check_write(info->seals, vma);
- if (ret)
- return ret;
file_accessed(file);
/* This is anonymous shared memory if it is unlinked at the time of mmap */
@@ -3329,7 +3510,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
size = min_t(size_t, size, PAGE_SIZE - offset);
- if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ if (!pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
*buf = (struct pipe_buffer) {
@@ -3356,7 +3537,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
int error = 0;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -3443,7 +3624,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
break;
cond_resched();
@@ -3731,16 +3912,16 @@ out_iput:
return error;
}
-static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
if (error)
- return error;
+ return ERR_PTR(error);
inc_nlink(dir);
- return 0;
+ return NULL;
}
static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
@@ -4585,48 +4766,37 @@ bad_value:
return invalfc(fc, "Bad value for '%s'", param->key);
}
-static int shmem_parse_options(struct fs_context *fc, void *data)
+static char *shmem_next_opt(char **s)
{
- char *options = data;
+ char *sbegin = *s;
+ char *p;
- if (options) {
- int err = security_sb_eat_lsm_opts(options, &fc->security);
- if (err)
- return err;
- }
+ if (sbegin == NULL)
+ return NULL;
- while (options != NULL) {
- char *this_char = options;
- for (;;) {
- /*
- * NUL-terminate this option: unfortunately,
- * mount options form a comma-separated list,
- * but mpol's nodelist may also contain commas.
- */
- options = strchr(options, ',');
- if (options == NULL)
- break;
- options++;
- if (!isdigit(*options)) {
- options[-1] = '\0';
- break;
- }
- }
- if (*this_char) {
- char *value = strchr(this_char, '=');
- size_t len = 0;
- int err;
-
- if (value) {
- *value++ = '\0';
- len = strlen(value);
- }
- err = vfs_parse_fs_string(fc, this_char, value, len);
- if (err < 0)
- return err;
+ /*
+ * NUL-terminate this option: unfortunately,
+ * mount options form a comma-separated list,
+ * but mpol's nodelist may also contain commas.
+ */
+ for (;;) {
+ p = strchr(*s, ',');
+ if (p == NULL)
+ break;
+ *s = p + 1;
+ if (!isdigit(*(p+1))) {
+ *p = '\0';
+ return sbegin;
}
}
- return 0;
+
+ *s = NULL;
+ return sbegin;
+}
+
+static int shmem_parse_monolithic(struct fs_context *fc, void *data)
+{
+ return vfs_parse_monolithic_sep(fc, data, shmem_next_opt);
}
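(To illustrate the separator logic above: given the mount data string "size=10m,mpol=bind:0,2,huge=always", successive calls to shmem_next_opt() yield "size=10m", then "mpol=bind:0,2" - the comma before the digit '2' is treated as part of the mpol nodelist rather than as an option separator - and finally "huge=always".)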
/*
@@ -4893,7 +5063,12 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sbinfo->gid = ctx->gid;
sbinfo->full_inums = ctx->full_inums;
sbinfo->mode = ctx->mode;
- sbinfo->huge = ctx->huge;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (ctx->seen & SHMEM_SEEN_HUGE)
+ sbinfo->huge = ctx->huge;
+ else
+ sbinfo->huge = tmpfs_huge;
+#endif
sbinfo->mpol = ctx->mpol;
ctx->mpol = NULL;
@@ -4971,7 +5146,7 @@ static const struct fs_context_operations shmem_fs_context_ops = {
.free = shmem_free_fc,
.get_tree = shmem_get_tree,
#ifdef CONFIG_TMPFS
- .parse_monolithic = shmem_parse_options,
+ .parse_monolithic = shmem_parse_monolithic,
.parse_param = shmem_parse_one,
.reconfigure = shmem_reconfigure,
#endif
@@ -5444,6 +5619,21 @@ static int __init setup_transparent_hugepage_shmem(char *str)
}
__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem);
+static int __init setup_transparent_hugepage_tmpfs(char *str)
+{
+ int huge;
+
+ huge = shmem_parse_huge(str);
+ if (huge < 0) {
+ pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n");
+ return huge;
+ }
+
+ tmpfs_huge = huge;
+ return 1;
+}
+__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs);
+
static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_shmem(char *str)
{
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 4a85b94d12ce..794bd433cce0 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -195,8 +195,6 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
{
- struct dentry *entry;
- char buf[128];
const char *new, *old;
va_list ap;
int ret = 0;
@@ -213,18 +211,8 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
old = shrinker->name;
shrinker->name = new;
- if (shrinker->debugfs_entry) {
- snprintf(buf, sizeof(buf), "%s-%d", shrinker->name,
- shrinker->debugfs_id);
-
- entry = debugfs_rename(shrinker_debugfs_root,
- shrinker->debugfs_entry,
- shrinker_debugfs_root, buf);
- if (IS_ERR(entry))
- ret = PTR_ERR(entry);
- else
- shrinker->debugfs_entry = entry;
- }
+ ret = debugfs_change_name(shrinker->debugfs_entry, "%s-%d",
+ shrinker->name, shrinker->debugfs_id);
mutex_unlock(&shrinker_mutex);
diff --git a/mm/slab.h b/mm/slab.h
index 632fedd71fea..05a21dc796e0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -128,7 +128,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)
/**
* slab_folio - The folio allocated for a slab
- * @slab: The slab.
+ * @s: The slab.
*
* Slabs are allocated as folios that contain the individual objects and are
* using some fields in the first struct page of the folio - those fields are
@@ -159,7 +159,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)
/**
* slab_page - The first struct page allocated for a slab
- * @slab: The slab.
+ * @s: The slab.
*
* A convenience wrapper for converting slab to the first struct page of the
* underlying folio, to communicate with code not yet converted to folio or
@@ -457,39 +457,17 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
}
-/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_PANIC | \
- SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
+ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
+ SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | SLAB_ACCOUNT | \
+ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-#ifdef CONFIG_SLUB_DEBUG
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
-#else
-#define SLAB_DEBUG_FLAGS (0)
-#endif
-#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_ACCOUNT | \
- SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-
-/* Common flags available with current configuration */
-#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
-
-/* Common flags permitted for kmem_cache_create */
-#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
- SLAB_RED_ZONE | \
- SLAB_POISON | \
- SLAB_STORE_USER | \
- SLAB_TRACE | \
- SLAB_CONSISTENCY_CHECKS | \
- SLAB_NOLEAKTRACE | \
- SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | \
- SLAB_ACCOUNT | \
- SLAB_KMALLOC | \
- SLAB_NO_MERGE | \
- SLAB_NO_USER_FLAGS)
+#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS)
bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
@@ -604,6 +582,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts);
#endif
+void kvfree_rcu_cb(struct rcu_head *head);
+
size_t __ksize(const void *objp);
static inline size_t slab_ksize(const struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a29457bef626..5be257e03c7c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -28,7 +28,9 @@
#include <asm/page.h>
#include <linux/memcontrol.h>
#include <linux/stackdepot.h>
+#include <trace/events/rcu.h>
+#include "../kernel/rcu/rcu.h"
#include "internal.h"
#include "slab.h"
@@ -296,6 +298,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
static_branch_enable(&slub_debug_enabled);
if (flags & SLAB_STORE_USER)
stack_depot_init();
+#else
+ flags &= ~SLAB_DEBUG_FLAGS;
#endif
mutex_lock(&slab_mutex);
@@ -305,20 +309,11 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
goto out_unlock;
}
- /* Refuse requests with allocator specific flags */
if (flags & ~SLAB_FLAGS_PERMITTED) {
err = -EINVAL;
goto out_unlock;
}
- /*
- * Some allocators will constraint the set of valid flags to a subset
- * of all flags. We expect them to define CACHE_CREATE_MASK in this
- * case, and we'll just provide them with a sanitized version of the
- * passed flags.
- */
- flags &= CACHE_CREATE_MASK;
-
/* Fail closed on bad usersize of useroffset values. */
if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
WARN_ON(!args->usersize && args->useroffset) ||
@@ -1282,3 +1277,908 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
EXPORT_TRACEPOINT_SYMBOL(kfree);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
+#ifndef CONFIG_KVFREE_RCU_BATCHED
+
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+ if (head) {
+ kasan_record_aux_stack(ptr);
+ call_rcu(head, kvfree_rcu_cb);
+ return;
+ }
+
+ // kvfree_rcu(one_arg) call.
+ might_sleep();
+ synchronize_rcu();
+ kvfree(ptr);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+
+void __init kvfree_rcu_init(void)
+{
+}
+
+#else /* CONFIG_KVFREE_RCU_BATCHED */
+
+/*
+ * This rcu parameter is runtime-read-only. It sets the
+ * minimum allowed number of objects which can be cached
+ * per CPU. Each object is one page in size. This value
+ * can only be changed at boot time.
+ */
+static int rcu_min_cached_objs = 5;
+module_param(rcu_min_cached_objs, int, 0444);
+
+// A page shrinker can ask for pages to be freed to make them
+// available for other parts of the system. This usually happens
+// under low memory conditions, and in that case we should also
+// defer page-cache filling for a short time period.
+//
+// The default value is 5 seconds, which is long enough to reduce
+// interference with the shrinker while it asks other systems to
+// drain their caches.
+static int rcu_delay_page_cache_fill_msec = 5000;
+module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+
+static struct workqueue_struct *rcu_reclaim_wq;
+
+/* Maximum number of jiffies to wait before draining a batch. */
+#define KFREE_DRAIN_JIFFIES (5 * HZ)
+#define KFREE_N_BATCHES 2
+#define FREE_N_CHANNELS 2
+
+/**
+ * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
+ * @list: List node. All blocks are linked between each other
+ * @gp_snap: Snapshot of RCU state for objects placed to this bulk
+ * @nr_records: Number of active pointers in the array
+ * @records: Array of the kvfree_rcu() pointers
+ */
+struct kvfree_rcu_bulk_data {
+ struct list_head list;
+ struct rcu_gp_oldstate gp_snap;
+ unsigned long nr_records;
+ void *records[] __counted_by(nr_records);
+};
+
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is sized so that a kvfree_rcu_bulk_data
+ * structure, including its records array, fills exactly one page.
+ */
+#define KVFREE_BULK_MAX_ENTR \
+ ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
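(As a rough worked example, assuming a 64-bit kernel with 4KiB pages and no unusual struct padding: the header is a 16-byte list_head, a 16-byte rcu_gp_oldstate and an 8-byte nr_records, i.e. 40 bytes, so KVFREE_BULK_MAX_ENTR works out to (4096 - 40) / 8 = 507 pointers per page-sized block.)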
+
+/**
+ * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
+ * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
+ * @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
+ * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
+ * @krcp: Pointer to @kfree_rcu_cpu structure
+ */
+
+struct kfree_rcu_cpu_work {
+ struct rcu_work rcu_work;
+ struct rcu_head *head_free;
+ struct rcu_gp_oldstate head_free_gp_snap;
+ struct list_head bulk_head_free[FREE_N_CHANNELS];
+ struct kfree_rcu_cpu *krcp;
+};
+
+/**
+ * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
+ * @head: List of kfree_rcu() objects not yet waiting for a grace period
+ * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
+ * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
+ * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
+ * @lock: Synchronize access to this structure
+ * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
+ * @initialized: The @rcu_work fields have been initialized
+ * @head_count: Number of objects in rcu_head singular list
+ * @bulk_count: Number of objects in bulk-list
+ * @bkvcache:
+ * A simple cache list that holds pages for reuse.
+ * To save some per-cpu space, the list is singly linked.
+ * Even though the list itself is lockless, access to it has to
+ * be protected by the per-cpu lock.
+ * @page_cache_work: A work to refill the cache when it is empty
+ * @backoff_page_cache_fill: Delay cache refills
+ * @work_in_progress: Indicates that page_cache_work is running
+ * @hrtimer: A hrtimer for scheduling a page_cache_work
+ * @nr_bkv_objs: number of allocated objects at @bkvcache.
+ *
+ * This is a per-CPU structure. The reason that it is not included in
+ * the rcu_data structure is to permit this code to be extracted from
+ * the RCU files. Such extraction could allow further optimization of
+ * the interactions with the slab allocators.
+ */
+struct kfree_rcu_cpu {
+ // Objects queued on a linked list
+ // through their rcu_head structures.
+ struct rcu_head *head;
+ unsigned long head_gp_snap;
+ atomic_t head_count;
+
+ // Objects queued on a bulk-list.
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ atomic_t bulk_count[FREE_N_CHANNELS];
+
+ struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
+ raw_spinlock_t lock;
+ struct delayed_work monitor_work;
+ bool initialized;
+
+ struct delayed_work page_cache_work;
+ atomic_t backoff_page_cache_fill;
+ atomic_t work_in_progress;
+ struct hrtimer hrtimer;
+
+ struct llist_head bkvcache;
+ int nr_bkv_objs;
+};
+
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
+};
+
+static __always_inline void
+debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
+{
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+ int i;
+
+ for (i = 0; i < bhead->nr_records; i++)
+ debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
+#endif
+}
+
+static inline struct kfree_rcu_cpu *
+krc_this_cpu_lock(unsigned long *flags)
+{
+ struct kfree_rcu_cpu *krcp;
+
+ local_irq_save(*flags); // For safely calling this_cpu_ptr().
+ krcp = this_cpu_ptr(&krc);
+ raw_spin_lock(&krcp->lock);
+
+ return krcp;
+}
+
+static inline void
+krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
+{
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+static inline struct kvfree_rcu_bulk_data *
+get_cached_bnode(struct kfree_rcu_cpu *krcp)
+{
+ if (!krcp->nr_bkv_objs)
+ return NULL;
+
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
+ return (struct kvfree_rcu_bulk_data *)
+ llist_del_first(&krcp->bkvcache);
+}
+
+static inline bool
+put_cached_bnode(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode)
+{
+ // Check the limit.
+ if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
+ return false;
+
+ llist_add((struct llist_node *) bnode, &krcp->bkvcache);
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
+ return true;
+}
+
+static int
+drain_page_cache(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+ struct llist_node *page_list, *pos, *n;
+ int freed = 0;
+
+ if (!rcu_min_cached_objs)
+ return 0;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ page_list = llist_del_all(&krcp->bkvcache);
+ WRITE_ONCE(krcp->nr_bkv_objs, 0);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ llist_for_each_safe(pos, n, page_list) {
+ free_page((unsigned long)pos);
+ freed++;
+ }
+
+ return freed;
+}
+
+static void
+kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode, int idx)
+{
+ unsigned long flags;
+ int i;
+
+ if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
+ debug_rcu_bhead_unqueue(bnode);
+ rcu_lock_acquire(&rcu_callback_map);
+ if (idx == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ "slab", bnode->nr_records,
+ bnode->records);
+
+ kfree_bulk(bnode->nr_records, bnode->records);
+ } else { // vmalloc() / vfree().
+ for (i = 0; i < bnode->nr_records; i++) {
+ trace_rcu_invoke_kvfree_callback(
+ "slab", bnode->records[i], 0);
+
+ vfree(bnode->records[i]);
+ }
+ }
+ rcu_lock_release(&rcu_callback_map);
+ }
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ if (put_cached_bnode(krcp, bnode))
+ bnode = NULL;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (bnode)
+ free_page((unsigned long) bnode);
+
+ cond_resched_tasks_rcu_qs();
+}
+
+static void
+kvfree_rcu_list(struct rcu_head *head)
+{
+ struct rcu_head *next;
+
+ for (; head; head = next) {
+ void *ptr = (void *) head->func;
+ unsigned long offset = (void *) head - ptr;
+
+ next = head->next;
+ debug_rcu_head_unqueue((struct rcu_head *)ptr);
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_kvfree_callback("slab", head, offset);
+
+ kvfree(ptr);
+
+ rcu_lock_release(&rcu_callback_map);
+ cond_resched_tasks_rcu_qs();
+ }
+}
+
+/*
+ * This function is invoked in workqueue context after a grace period.
+ * It frees all the objects queued on ->bulk_head_free or ->head_free.
+ */
+static void kfree_rcu_work(struct work_struct *work)
+{
+ unsigned long flags;
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ struct rcu_head *head;
+ struct kfree_rcu_cpu *krcp;
+ struct kfree_rcu_cpu_work *krwp;
+ struct rcu_gp_oldstate head_gp_snap;
+ int i;
+
+ krwp = container_of(to_rcu_work(work),
+ struct kfree_rcu_cpu_work, rcu_work);
+ krcp = krwp->krcp;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ // Channels 1 and 2.
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
+
+ // Channel 3.
+ head = krwp->head_free;
+ krwp->head_free = NULL;
+ head_gp_snap = krwp->head_free_gp_snap;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ // Handle the first two channels.
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ // Start from the tail page, so a GP is likely passed for it.
+ list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
+
+ /*
+	 * This is used when the "bulk" path cannot be used for the
+	 * double-argument variant of kvfree_rcu(). This happens when the
+ * page-cache is empty, which means that objects are instead
+ * queued on a linked list through their rcu_head structures.
+ * This list is named "Channel 3".
+ */
+ if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
+ kvfree_rcu_list(head);
+}
+
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krcp->bulk_head[i]))
+ return true;
+
+ return !!READ_ONCE(krcp->head);
+}
+
+static bool
+need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krwp->bulk_head_free[i]))
+ return true;
+
+ return !!krwp->head_free;
+}
+
+static int krc_count(struct kfree_rcu_cpu *krcp)
+{
+ int sum = atomic_read(&krcp->head_count);
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ sum += atomic_read(&krcp->bulk_count[i]);
+
+ return sum;
+}
+
+static void
+__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+ long delay, delay_left;
+
+ delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1 : KFREE_DRAIN_JIFFIES;
+ if (delayed_work_pending(&krcp->monitor_work)) {
+ delay_left = krcp->monitor_work.timer.expires - jiffies;
+ if (delay < delay_left)
+ mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
+ return;
+ }
+ queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
+}
+
+static void
+schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ __schedule_delayed_monitor_work(krcp);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+}
+
+static void
+kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
+{
+ struct list_head bulk_ready[FREE_N_CHANNELS];
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct rcu_head *head_ready = NULL;
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ INIT_LIST_HEAD(&bulk_ready[i]);
+
+ list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
+ if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
+ break;
+
+ atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
+ list_move(&bnode->list, &bulk_ready[i]);
+ }
+ }
+
+ if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
+ head_ready = krcp->head;
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
+ }
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
+
+ if (head_ready)
+ kvfree_rcu_list(head_ready);
+}
+
+/*
+ * Return: %true if a work is queued, %false otherwise.
+ */
+static bool
+kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
+{
+ unsigned long flags;
+ bool queued = false;
+ int i, j;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+
+ // Attempt to start a new batch.
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
+
+ // Try to detach bulk_head or head and attach it, only when
+ // all channels are free. If any channel is not free, there is
+ // still ongoing RCU work at this krwp handling its free requests.
+ if (need_wait_for_krwp_work(krwp))
+ continue;
+
+ // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
+ if (need_offload_krc(krcp)) {
+ // Channel 1 corresponds to the SLAB-pointer bulk path.
+ // Channel 2 corresponds to vmalloc-pointer bulk path.
+ for (j = 0; j < FREE_N_CHANNELS; j++) {
+ if (list_empty(&krwp->bulk_head_free[j])) {
+ atomic_set(&krcp->bulk_count[j], 0);
+ list_replace_init(&krcp->bulk_head[j],
+ &krwp->bulk_head_free[j]);
+ }
+ }
+
+ // Channel 3 corresponds to both SLAB and vmalloc
+ // objects queued on the linked list.
+ if (!krwp->head_free) {
+ krwp->head_free = krcp->head;
+ get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
+ }
+
+ // One work item handles one batch, and a batch covers all
+ // three "free channels". Break out of the loop: we are done
+ // with this CPU, so queuing the RCU work here always
+ // succeeds.
+ queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
+ WARN_ON_ONCE(!queued);
+ break;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ return queued;
+}
+
+/*
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+ struct kfree_rcu_cpu *krcp = container_of(work,
+ struct kfree_rcu_cpu, monitor_work.work);
+
+ // Drain ready for reclaim.
+ kvfree_rcu_drain_ready(krcp);
+
+ // Queue a batch for the rest.
+ kvfree_rcu_queue_batch(krcp);
+
+ // If there is nothing left to detach, our job here is done.
+ // If at least one of the channels is still busy, rearm the
+ // work to try again, because previous batches are still in
+ // progress.
+ if (need_offload_krc(krcp))
+ schedule_delayed_monitor_work(krcp);
+}
+
+static void fill_page_cache_func(struct work_struct *work)
+{
+ struct kvfree_rcu_bulk_data *bnode;
+ struct kfree_rcu_cpu *krcp =
+ container_of(work, struct kfree_rcu_cpu,
+ page_cache_work.work);
+ unsigned long flags;
+ int nr_pages;
+ bool pushed;
+ int i;
+
+ nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
+ 1 : rcu_min_cached_objs;
+
+ for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+ if (!bnode)
+ break;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ pushed = put_cached_bnode(krcp, bnode);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (!pushed) {
+ free_page((unsigned long) bnode);
+ break;
+ }
+ }
+
+ atomic_set(&krcp->work_in_progress, 0);
+ atomic_set(&krcp->backoff_page_cache_fill, 0);
+}
+
+// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
+// state specified by flags. If can_alloc is true, the caller must
+// be schedulable and not be holding any locks or mutexes that might be
+// acquired by the memory allocator or anything that it might invoke.
+// Returns true if ptr was successfully recorded, else the caller must
+// use a fallback.
+static inline bool
+add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
+ unsigned long *flags, void *ptr, bool can_alloc)
+{
+ struct kvfree_rcu_bulk_data *bnode;
+ int idx;
+
+ *krcp = krc_this_cpu_lock(flags);
+ if (unlikely(!(*krcp)->initialized))
+ return false;
+
+ idx = !!is_vmalloc_addr(ptr);
+ bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+ struct kvfree_rcu_bulk_data, list);
+
+ /* Check if a new block is required. */
+ if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
+ bnode = get_cached_bnode(*krcp);
+ if (!bnode && can_alloc) {
+ krc_this_cpu_unlock(*krcp, *flags);
+
+ // __GFP_NORETRY - allows a light-weight direct reclaim,
+ // which is fine for minimizing how often the fallback
+ // path is hit. It also forbids invoking the OOM killer,
+ // which is beneficial since we are about to release
+ // memory soon anyway.
+ //
+ // __GFP_NOMEMALLOC - prevents consuming all of the
+ // memory reserves. Note that we have a fallback path.
+ //
+ // __GFP_NOWARN - allocations are expected to fail under
+ // low-memory or high memory-pressure scenarios, so do
+ // not warn about it.
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
+ }
+
+ if (!bnode)
+ return false;
+
+ // Initialize the new block and attach it.
+ bnode->nr_records = 0;
+ list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
+ }
+
+ // Finally insert and update the GP for this page.
+ bnode->nr_records++;
+ bnode->records[bnode->nr_records - 1] = ptr;
+ get_state_synchronize_rcu_full(&bnode->gp_snap);
+ atomic_inc(&(*krcp)->bulk_count[idx]);
+
+ return true;
+}
+
+static enum hrtimer_restart
+schedule_page_work_fn(struct hrtimer *t)
+{
+ struct kfree_rcu_cpu *krcp =
+ container_of(t, struct kfree_rcu_cpu, hrtimer);
+
+ queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
+ return HRTIMER_NORESTART;
+}
+
+static void
+run_page_cache_worker(struct kfree_rcu_cpu *krcp)
+{
+ // If cache disabled, bail out.
+ if (!rcu_min_cached_objs)
+ return;
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+ !atomic_xchg(&krcp->work_in_progress, 1)) {
+ if (atomic_read(&krcp->backoff_page_cache_fill)) {
+ queue_delayed_work(rcu_reclaim_wq,
+ &krcp->page_cache_work,
+ msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+ } else {
+ hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ }
+ }
+}
+
+void __init kfree_rcu_scheduler_running(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ if (need_offload_krc(krcp))
+ schedule_delayed_monitor_work(krcp);
+ }
+}
+
+/*
+ * Queue a request for lazy invocation of the appropriate free routine
+ * after a grace period. Please note that three paths are maintained,
+ * two for the common case using arrays of pointers and a third one that
+ * is used only when the main paths cannot be used, for example, due to
+ * memory pressure.
+ *
+ * Each kvfree_call_rcu() request is added to a batch. The batch is drained
+ * every KFREE_DRAIN_JIFFIES jiffies, and all the objects in the batch are
+ * freed in workqueue context. Batching requests together reduces the number
+ * of grace periods needed under heavy kfree_rcu()/kvfree_rcu() load.
+ */
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+ unsigned long flags;
+ struct kfree_rcu_cpu *krcp;
+ bool success;
+
+ /*
+	 * Please note that the head-less variant has a limitation,
+	 * hence the clear rule for such objects: it can only be
+	 * used from a context where sleeping is allowed. Everywhere
+	 * else, please embed an rcu_head in your data instead.
+ */
+ if (!head)
+ might_sleep();
+
+ // Queue the object but don't yet schedule the batch.
+ if (debug_rcu_head_queue(ptr)) {
+ // Probable double kfree_rcu(), just leak.
+ WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
+ __func__, head);
+
+ // Mark as success and leave.
+ return;
+ }
+
+ kasan_record_aux_stack(ptr);
+ success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
+ if (!success) {
+ run_page_cache_worker(krcp);
+
+ if (head == NULL)
+ // Inline if kvfree_rcu(one_arg) call.
+ goto unlock_return;
+
+ head->func = ptr;
+ head->next = krcp->head;
+ WRITE_ONCE(krcp->head, head);
+ atomic_inc(&krcp->head_count);
+
+ // Take a snapshot for this krcp.
+ krcp->head_gp_snap = get_state_synchronize_rcu();
+ success = true;
+ }
+
+ /*
+ * The kvfree_rcu() caller considers the pointer freed at this point
+ * and likely removes any references to it. Since the actual slab
+ * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
+ * this object (no scanning or false positives reporting).
+ */
+ kmemleak_ignore(ptr);
+
+ // Set timer to drain after KFREE_DRAIN_JIFFIES.
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
+ __schedule_delayed_monitor_work(krcp);
+
+unlock_return:
+ krc_this_cpu_unlock(krcp, flags);
+
+ /*
+	 * Inline kvfree() after synchronize_rcu(). We can only do
+	 * this from a sleepable context, where the current CPU can
+	 * pass through a quiescent state.
+ */
+ if (!success) {
+ debug_rcu_head_unqueue((struct rcu_head *) ptr);
+ synchronize_rcu();
+ kvfree(ptr);
+ }
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
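(For reference, the two caller-facing forms that funnel into kvfree_call_rcu() look roughly like this; an illustrative sketch only, with struct foo and the call sites made up:

	struct foo {
		int data;
		struct rcu_head rcu;
	};

	static void release_foo(struct foo *f)
	{
		/* Two-argument form: queued into the batching machinery above,
		 * safe to call from atomic context. */
		kvfree_rcu(f, rcu);
	}

	static void release_blob(void *blob)
	{
		/* Head-less form: may fall back to synchronize_rcu() + kvfree(),
		 * so it must only be used where sleeping is allowed. */
		kvfree_rcu_mightsleep(blob);
	}
)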
+
+/**
+ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+ *
+ * Note that the single-argument form of kvfree_rcu() has a slow path that
+ * triggers synchronize_rcu() followed by freeing the pointer, all before
+ * the function returns. Therefore, for any single-argument call that will
+ * result in a kfree() into a cache that is to be destroyed during module
+ * exit, it is the developer's responsibility to ensure that all such calls
+ * have returned before the call to kmem_cache_destroy().
+ */
+void kvfree_rcu_barrier(void)
+{
+ struct kfree_rcu_cpu_work *krwp;
+ struct kfree_rcu_cpu *krcp;
+ bool queued;
+ int i, cpu;
+
+ /*
+	 * First, detach the objects on each CPU and queue them in an
+	 * RCU batch. Then, flush the queued works for each CPU.
+	 *
+	 * Please note: if there are outstanding batches for a particular
+	 * CPU, those have to finish first before a new one can be queued.
+ */
+ for_each_possible_cpu(cpu) {
+ krcp = per_cpu_ptr(&krc, cpu);
+
+ /*
+		 * Check if this CPU has any objects which have been queued for a
+		 * new GP completion. If not (nothing to detach), we are done with
+		 * it. If any batch is pending/running for this "krcp", the per-cpu
+		 * flush_rcu_work() below waits for its completion (see the last step).
+ */
+ if (!need_offload_krc(krcp))
+ continue;
+
+ while (1) {
+ /*
+			 * If we are not able to queue a new RCU work, it means either:
+			 * - batches for this CPU are still in flight and must be
+			 *   flushed first, after which we repeat; or
+			 * - there are no objects to detach, due to concurrency.
+ */
+ queued = kvfree_rcu_queue_batch(krcp);
+
+ /*
+			 * Bail out if there is no need to offload this "krcp"
+			 * anymore. As noted earlier, this can run concurrently.
+ */
+ if (queued || !need_offload_krc(krcp))
+ break;
+
+ /* There are ongoing batches. */
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ krwp = &(krcp->krw_arr[i]);
+ flush_rcu_work(&krwp->rcu_work);
+ }
+ }
+ }
+
+ /*
+ * Now we guarantee that all objects are flushed.
+ */
+ for_each_possible_cpu(cpu) {
+ krcp = per_cpu_ptr(&krc, cpu);
+
+ /*
+		 * The monitor work can drain ready-to-reclaim objects
+		 * directly. Wait for its completion if it is running or pending.
+ */
+ cancel_delayed_work_sync(&krcp->monitor_work);
+
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ krwp = &(krcp->krw_arr[i]);
+ flush_rcu_work(&krwp->rcu_work);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
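To make the responsibility described above concrete, here is a hedged sketch of the module-exit ordering (foo_cachep and foo_exit() are hypothetical names, not part of this patch):

#include <linux/module.h>
#include <linux/slab.h>

static struct kmem_cache *foo_cachep;	/* hypothetical cache backing kvfree_rcu()'d objects */

static void __exit foo_exit(void)
{
	/* Wait for all in-flight kvfree_rcu() callbacks before the cache goes away. */
	kvfree_rcu_barrier();
	kmem_cache_destroy(foo_cachep);
}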
+
+static unsigned long
+kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long count = 0;
+
+ /* Snapshot count of all CPUs */
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ count += krc_count(krcp);
+ count += READ_ONCE(krcp->nr_bkv_objs);
+ atomic_set(&krcp->backoff_page_cache_fill, 1);
+ }
+
+ return count == 0 ? SHRINK_EMPTY : count;
+}
+
+static unsigned long
+kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu, freed = 0;
+
+ for_each_possible_cpu(cpu) {
+ int count;
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ count = krc_count(krcp);
+ count += drain_page_cache(krcp);
+ kfree_rcu_monitor(&krcp->monitor_work.work);
+
+ sc->nr_to_scan -= count;
+ freed += count;
+
+ if (sc->nr_to_scan <= 0)
+ break;
+ }
+
+ return freed == 0 ? SHRINK_STOP : freed;
+}
+
+void __init kvfree_rcu_init(void)
+{
+ int cpu;
+ int i, j;
+ struct shrinker *kfree_rcu_shrinker;
+
+ rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_reclaim_wq);
+
+ /* Clamp it to [0:100] seconds interval. */
+ if (rcu_delay_page_cache_fill_msec < 0 ||
+ rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
+
+ rcu_delay_page_cache_fill_msec =
+ clamp(rcu_delay_page_cache_fill_msec, 0,
+ (int) (100 * MSEC_PER_SEC));
+
+ pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
+ rcu_delay_page_cache_fill_msec);
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
+ krcp->krw_arr[i].krcp = krcp;
+
+ for (j = 0; j < FREE_N_CHANNELS; j++)
+ INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
+ }
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ INIT_LIST_HEAD(&krcp->bulk_head[i]);
+
+ INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+ INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
+ krcp->initialized = true;
+ }
+
+ kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
+ if (!kfree_rcu_shrinker) {
+ pr_err("Failed to allocate kfree_rcu() shrinker!\n");
+ return;
+ }
+
+ kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
+ kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
+
+ shrinker_register(kfree_rcu_shrinker);
+}
+
+#endif /* CONFIG_KVFREE_RCU_BATCHED */
+
diff --git a/mm/slub.c b/mm/slub.c
index c2151c9fee22..5eac408e818e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -19,6 +19,7 @@
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
+#include <linux/vmalloc.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
@@ -1017,22 +1018,31 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object)
set_orig_size(s, (void *)object, s->object_size);
}
-static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp)
{
struct va_format vaf;
va_list args;
- va_start(args, fmt);
+ va_copy(args, argsp);
vaf.fmt = fmt;
vaf.va = &args;
pr_err("=============================================================================\n");
- pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
+ pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf);
pr_err("-----------------------------------------------------------------------------\n\n");
va_end(args);
}
+static void slab_bug(struct kmem_cache *s, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ __slab_bug(s, fmt, args);
+ va_end(args);
+}
+
__printf(2, 3)
-static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+static void slab_fix(struct kmem_cache *s, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -1085,19 +1095,19 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
/* Beginning of the filler is the free pointer */
print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
-
- dump_stack();
}
static void object_err(struct kmem_cache *s, struct slab *slab,
- u8 *object, char *reason)
+ u8 *object, const char *reason)
{
if (slab_add_kunit_errors())
return;
- slab_bug(s, "%s", reason);
+ slab_bug(s, reason);
print_trailer(s, slab, object);
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ WARN_ON(1);
}
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
@@ -1114,22 +1124,30 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
return false;
}
+static void __slab_err(struct slab *slab)
+{
+ if (slab_in_kunit_test())
+ return;
+
+ print_slab_info(slab);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ WARN_ON(1);
+}
+
static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
const char *fmt, ...)
{
va_list args;
- char buf[100];
if (slab_add_kunit_errors())
return;
va_start(args, fmt);
- vsnprintf(buf, sizeof(buf), fmt, args);
+ __slab_bug(s, fmt, args);
va_end(args);
- slab_bug(s, "%s", buf);
- print_slab_info(slab);
- dump_stack();
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ __slab_err(slab);
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
@@ -1166,7 +1184,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
s->inuse - poison_size);
}
-static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
+static void restore_bytes(struct kmem_cache *s, const char *message, u8 data,
void *from, void *to)
{
slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
@@ -1181,8 +1199,8 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
static pad_check_attributes int
check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
- u8 *object, char *what,
- u8 *start, unsigned int value, unsigned int bytes)
+ u8 *object, const char *what, u8 *start, unsigned int value,
+ unsigned int bytes, bool slab_obj_print)
{
u8 *fault;
u8 *end;
@@ -1201,10 +1219,11 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
if (slab_add_kunit_errors())
goto skip_bug_print;
- slab_bug(s, "%s overwritten", what);
- pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
- fault, end - 1, fault - addr,
- fault[0], value);
+ pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ what, fault, end - 1, fault - addr, fault[0], value);
+
+ if (slab_obj_print)
+ object_err(s, slab, object, "Object corrupt");
skip_bug_print:
restore_bytes(s, what, value, fault, end);
@@ -1268,7 +1287,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
return 1;
return check_bytes_and_report(s, slab, p, "Object padding",
- p + off, POISON_INUSE, size_from_object(s) - off);
+ p + off, POISON_INUSE, size_from_object(s) - off, true);
}
/* Check the pad bytes at the end of a slab page */
@@ -1301,9 +1320,10 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab)
while (end > fault && end[-1] == POISON_INUSE)
end--;
- slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
- fault, end - 1, fault - start);
+ slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+ fault, end - 1, fault - start);
print_section(KERN_ERR, "Padding ", pad, remainder);
+ __slab_err(slab);
restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
}
@@ -1318,11 +1338,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, slab, object, "Left Redzone",
- object - s->red_left_pad, val, s->red_left_pad))
+ object - s->red_left_pad, val, s->red_left_pad, ret))
ret = 0;
if (!check_bytes_and_report(s, slab, object, "Right Redzone",
- endobject, val, s->inuse - s->object_size))
+ endobject, val, s->inuse - s->object_size, ret))
ret = 0;
if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
@@ -1331,7 +1351,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (s->object_size > orig_size &&
!check_bytes_and_report(s, slab, object,
"kmalloc Redzone", p + orig_size,
- val, s->object_size - orig_size)) {
+ val, s->object_size - orig_size, ret)) {
ret = 0;
}
}
@@ -1339,7 +1359,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
if (!check_bytes_and_report(s, slab, p, "Alignment padding",
endobject, POISON_INUSE,
- s->inuse - s->object_size))
+ s->inuse - s->object_size, ret))
ret = 0;
}
}
@@ -1355,11 +1375,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (kasan_meta_size < s->object_size - 1 &&
!check_bytes_and_report(s, slab, p, "Poison",
p + kasan_meta_size, POISON_FREE,
- s->object_size - kasan_meta_size - 1))
+ s->object_size - kasan_meta_size - 1, ret))
ret = 0;
if (kasan_meta_size < s->object_size &&
!check_bytes_and_report(s, slab, p, "End Poison",
- p + s->object_size - 1, POISON_END, 1))
+ p + s->object_size - 1, POISON_END, 1, ret))
ret = 0;
}
/*
@@ -1385,11 +1405,6 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
ret = 0;
}
- if (!ret && !slab_in_kunit_test()) {
- print_trailer(s, slab, object);
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
- }
-
return ret;
}
@@ -1427,7 +1442,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab)
* Determine if a certain object in a slab is on the freelist. Must hold the
* slab lock to guarantee that the chains are in a consistent state.
*/
-static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
+static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
{
int nr = 0;
void *fp;
@@ -1437,26 +1452,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
fp = slab->freelist;
while (fp && nr <= slab->objects) {
if (fp == search)
- return 1;
+ return true;
if (!check_valid_pointer(s, slab, fp)) {
if (object) {
object_err(s, slab, object,
"Freechain corrupt");
set_freepointer(s, object, NULL);
+ break;
} else {
slab_err(s, slab, "Freepointer corrupt");
slab->freelist = NULL;
slab->inuse = slab->objects;
slab_fix(s, "Freelist cleared");
- return 0;
+ return false;
}
- break;
}
object = fp;
fp = get_freepointer(s, object);
nr++;
}
+ if (nr > slab->objects) {
+ slab_err(s, slab, "Freelist cycle detected");
+ slab->freelist = NULL;
+ slab->inuse = slab->objects;
+ slab_fix(s, "Freelist cleared");
+ return false;
+ }
+
max_objects = order_objects(slab_order(slab), s->size);
if (max_objects > MAX_OBJS_PER_PAGE)
max_objects = MAX_OBJS_PER_PAGE;
@@ -1624,12 +1647,12 @@ static inline int free_consistency_checks(struct kmem_cache *s,
slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
object);
} else if (!slab->slab_cache) {
- pr_err("SLUB <none>: no slab for object 0x%p.\n",
- object);
- dump_stack();
- } else
+ slab_err(NULL, slab, "No slab cache for object 0x%p",
+ object);
+ } else {
object_err(s, slab, object,
- "page slab pointer corrupt.");
+ "page slab pointer corrupt.");
+ }
return 0;
}
return 1;
@@ -2311,7 +2334,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
* We have to do this manually because the rcu_head is
* not located inside the object.
*/
- kasan_record_aux_stack_noalloc(x);
+ kasan_record_aux_stack(x);
delayed_free->object = x;
call_rcu(&delayed_free->head, slab_free_after_rcu_debug);
@@ -2420,17 +2443,15 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node,
unsigned int order = oo_order(oo);
if (node == NUMA_NO_NODE)
- folio = (struct folio *)alloc_pages(flags, order);
+ folio = (struct folio *)alloc_frozen_pages(flags, order);
else
- folio = (struct folio *)__alloc_pages_node(node, flags, order);
+ folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL);
if (!folio)
return NULL;
slab = folio_slab(folio);
__folio_set_slab(folio);
- /* Make the flag visible before any changes to folio->mapping */
- smp_wmb();
if (folio_is_pfmemalloc(folio))
slab_set_pfmemalloc(slab);
@@ -2651,12 +2672,10 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
__slab_clear_pfmemalloc(slab);
folio->mapping = NULL;
- /* Make the mapping reset visible before clearing the flag */
- smp_wmb();
__folio_clear_slab(folio);
mm_account_reclaimed_pages(pages);
unaccount_slab(slab, order, s);
- __free_pages(&folio->page, order);
+ free_frozen_pages(&folio->page, order);
}
static void rcu_free_slab(struct rcu_head *h)
@@ -4245,6 +4264,7 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
ptr = folio_address(folio);
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
PAGE_SIZE << order);
+ __folio_set_large_kmalloc(folio);
}
ptr = kasan_kmalloc_large(ptr, size, flags);
@@ -4720,6 +4740,11 @@ static void free_large_kmalloc(struct folio *folio, void *object)
{
unsigned int order = folio_order(folio);
+ if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) {
+ dump_page(&folio->page, "Not a kmalloc allocation");
+ return;
+ }
+
if (WARN_ON_ONCE(order == 0))
pr_warn_once("object pointer: 0x%p\n", object);
@@ -4729,9 +4754,55 @@ static void free_large_kmalloc(struct folio *folio, void *object)
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
-(PAGE_SIZE << order));
+ __folio_clear_large_kmalloc(folio);
folio_put(folio);
}
+/*
+ * Given an rcu_head embedded within an object obtained from kvmalloc at an
+ * offset < 4k, free the object in question.
+ */
+void kvfree_rcu_cb(struct rcu_head *head)
+{
+ void *obj = head;
+ struct folio *folio;
+ struct slab *slab;
+ struct kmem_cache *s;
+ void *slab_addr;
+
+ if (is_vmalloc_addr(obj)) {
+ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+ vfree(obj);
+ return;
+ }
+
+ folio = virt_to_folio(obj);
+ if (!folio_test_slab(folio)) {
+ /*
+		 * The rcu_head offset can only be less than the page size, so there
+		 * is no need to consider the folio order.
+ */
+ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+ free_large_kmalloc(folio, obj);
+ return;
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+ slab_addr = folio_address(folio);
+
+ if (is_kfence_address(obj)) {
+ obj = kfence_object_start(obj);
+ } else {
+ unsigned int idx = __obj_to_index(s, slab_addr, obj);
+
+ obj = slab_addr + s->size * idx;
+ obj = fixup_red_left(s, obj);
+ }
+
+ slab_free(s, slab, obj, _RET_IP_);
+}
+
/**
* kfree - free previously allocated memory
* @object: pointer returned by kmalloc() or kmem_cache_alloc()
@@ -4882,6 +4953,168 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
}
EXPORT_SYMBOL(krealloc_noprof);
+static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
+{
+ /*
+	 * We want to attempt a large physically contiguous block first because
+	 * it is less likely to fragment multiple larger blocks and therefore
+	 * contributes less to long-term fragmentation than the vmalloc fallback.
+	 * However, make sure that larger requests are not too disruptive - no
+	 * OOM killer and no allocation failure warnings, as we have a fallback.
+ */
+ if (size > PAGE_SIZE) {
+ flags |= __GFP_NOWARN;
+
+ if (!(flags & __GFP_RETRY_MAYFAIL))
+ flags |= __GFP_NORETRY;
+
+ /* nofail semantic is implemented by the vmalloc fallback */
+ flags &= ~__GFP_NOFAIL;
+ }
+
+ return flags;
+}
+
+/**
+ * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @b: which set of kmalloc buckets to allocate from.
+ * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * GFP_NOWAIT and GFP_ATOMIC are not supported, nor is the __GFP_NORETRY modifier.
+ * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
+ * preferable to the vmalloc fallback, due to visible performance drawbacks.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of failure
+ */
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
+{
+ void *ret;
+
+ /*
+	 * It doesn't really make sense to fall back to vmalloc for sub-page
+	 * requests.
+ */
+ ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b),
+ kmalloc_gfp_adjust(flags, size),
+ node, _RET_IP_);
+ if (ret || size <= PAGE_SIZE)
+ return ret;
+
+ /* non-sleeping allocations are not supported by vmalloc */
+ if (!gfpflags_allow_blocking(flags))
+ return NULL;
+
+ /* Don't even allow crazy sizes */
+ if (unlikely(size > INT_MAX)) {
+ WARN_ON_ONCE(!(flags & __GFP_NOWARN));
+ return NULL;
+ }
+
+ /*
+ * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
+ * since the callers already cannot assume anything
+ * about the resulting pointer, and cannot play
+ * protection games.
+ */
+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+ flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(__kvmalloc_node_noprof);
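For illustration, a minimal caller-side sketch of the API documented above (the table and its size are made up): allocate with kvmalloc_array(), which may transparently fall back to vmalloc for larger requests, and always release with kvfree().

#include <linux/slab.h>
#include <linux/types.h>

static int example_build_table(size_t nr_entries)
{
	u64 *table = kvmalloc_array(nr_entries, sizeof(*table), GFP_KERNEL);

	if (!table)
		return -ENOMEM;

	/* ... populate and use the table ... */

	kvfree(table);	/* correct for both the kmalloc and the vmalloc case */
	return 0;
}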
+
+/**
+ * kvfree() - Free memory.
+ * @addr: Pointer to allocated memory.
+ *
+ * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
+ * It is slightly more efficient to use kfree() or vfree() if you are certain
+ * that you know which one to use.
+ *
+ * Context: Either preemptible task context or not-NMI interrupt.
+ */
+void kvfree(const void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+}
+EXPORT_SYMBOL(kvfree);
+
+/**
+ * kvfree_sensitive - Free a data object containing sensitive information.
+ * @addr: address of the data object to be freed.
+ * @len: length of the data object.
+ *
+ * Use the special memzero_explicit() function to clear the content of a
+ * kvmalloc'ed object containing sensitive data to make sure that the
+ * compiler won't optimize out the data clearing.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+ if (likely(!ZERO_OR_NULL_PTR(addr))) {
+ memzero_explicit((void *)addr, len);
+ kvfree(addr);
+ }
+}
+EXPORT_SYMBOL(kvfree_sensitive);
+
+/**
+ * kvrealloc - reallocate memory; contents remain unchanged
+ * @p: object to reallocate memory for
+ * @size: the size to reallocate
+ * @flags: the flags for the page level allocator
+ *
+ * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
+ * and @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
+ *
+ * This function must not be called concurrently with itself or kvfree() for the
+ * same memory allocation.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
+ */
+void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
+{
+ void *n;
+
+ if (is_vmalloc_addr(p))
+ return vrealloc_noprof(p, size, flags);
+
+ n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
+ if (!n) {
+ /* We failed to krealloc(), fall back to kvmalloc(). */
+ n = kvmalloc_noprof(size, flags);
+ if (!n)
+ return NULL;
+
+ if (p) {
+ /* We already know that `p` is not a vmalloc address. */
+ kasan_disable_current();
+ memcpy(n, kasan_reset_tag(p), ksize(p));
+ kasan_enable_current();
+
+ kfree(p);
+ }
+ }
+
+ return n;
+}
+EXPORT_SYMBOL(kvrealloc_noprof);
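A small grow-on-demand sketch using kvrealloc() as documented above (illustrative only; example_grow() is not part of the patch). On failure the old allocation is left untouched, so the caller keeps a valid pointer.

#include <linux/slab.h>

static int example_grow(void **bufp, size_t new_size)
{
	void *tmp = kvrealloc(*bufp, new_size, GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;	/* *bufp is still valid and unchanged */

	*bufp = tmp;
	return 0;
}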
+
struct detached_freelist {
struct slab *slab;
void *tail;
@@ -5574,14 +5807,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
return !!oo_objects(s->oo);
}
-static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
- const char *text)
+static void list_slab_objects(struct kmem_cache *s, struct slab *slab)
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = slab_address(slab);
void *p;
- slab_err(s, slab, text, s->name);
+ if (!slab_add_kunit_errors())
+ slab_bug(s, "Objects remaining on __kmem_cache_shutdown()");
spin_lock(&object_map_lock);
__fill_map(object_map, s, slab);
@@ -5596,6 +5829,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
}
}
spin_unlock(&object_map_lock);
+
+ __slab_err(slab);
#endif
}
@@ -5616,8 +5851,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
remove_partial(n, slab);
list_add(&slab->slab_list, &discard);
} else {
- list_slab_objects(s, slab,
- "Objects remaining in %s on __kmem_cache_shutdown()");
+ list_slab_objects(s, slab);
}
}
spin_unlock_irq(&n->list_lock);
@@ -7513,10 +7747,7 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
return -ENOMEM;
}
- if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
- alloc = TRACK_ALLOC;
- else
- alloc = TRACK_FREE;
+ alloc = debugfs_get_aux_num(filep);
if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
bitmap_free(obj_map);
@@ -7572,11 +7803,11 @@ static void debugfs_slab_add(struct kmem_cache *s)
slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
- debugfs_create_file("alloc_traces", 0400,
- slab_cache_dir, s, &slab_debugfs_fops);
+ debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s,
+ TRACK_ALLOC, &slab_debugfs_fops);
- debugfs_create_file("free_traces", 0400,
- slab_cache_dir, s, &slab_debugfs_fops);
+ debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s,
+ TRACK_FREE, &slab_debugfs_fops);
}
void debugfs_slab_release(struct kmem_cache *s)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index cec67c5f37d8..3287ebadd167 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -31,6 +31,8 @@
#include <asm/dma.h>
#include <asm/pgalloc.h>
+#include "internal.h"
+
/*
* Allocate a block of memory to be used to back the virtual memory map
* or to back the page tables that are used to create the mapping.
@@ -42,8 +44,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
{
- return memblock_alloc_try_nid_raw(size, align, goal,
- MEMBLOCK_ALLOC_ACCESSIBLE, node);
+ return memmap_alloc(size, align, goal, node, false);
}
void * __meminit vmemmap_alloc_block(unsigned long size, int node)
diff --git a/mm/sparse.c b/mm/sparse.c
index 13b6624d3562..133b033d0cba 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -257,10 +257,7 @@ static void __init memblocks_present(void)
size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
align = 1 << (INTERNODE_CACHE_SHIFT);
- mem_section = memblock_alloc(size, align);
- if (!mem_section)
- panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
- __func__, size, align);
+ mem_section = memblock_alloc_or_panic(size, align);
}
#endif
diff --git a/mm/swap.c b/mm/swap.c
index 10decd9dffa1..fc8281ef4241 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -109,7 +109,7 @@ void __folio_put(struct folio *folio)
page_cache_release(folio);
folio_unqueue_deferred_split(folio);
mem_cgroup_uncharge(folio);
- free_unref_page(&folio->page, folio_order(folio));
+ free_frozen_pages(&folio->page, folio_order(folio));
}
EXPORT_SYMBOL(__folio_put);
@@ -379,37 +379,58 @@ static void __lru_cache_activate_folio(struct folio *folio)
}
#ifdef CONFIG_LRU_GEN
-static void folio_inc_refs(struct folio *folio)
+
+static void lru_gen_inc_refs(struct folio *folio)
{
unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
if (folio_test_unevictable(folio))
return;
+ /* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio)) {
- folio_set_referenced(folio);
- return;
- }
-
- if (!folio_test_workingset(folio)) {
- folio_set_workingset(folio);
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
return;
}
- /* see the comment on MAX_NR_TIERS */
do {
- new_flags = old_flags & LRU_REFS_MASK;
- if (new_flags == LRU_REFS_MASK)
- break;
+ if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) {
+ if (!folio_test_workingset(folio))
+ folio_set_workingset(folio);
+ return;
+ }
- new_flags += BIT(LRU_REFS_PGOFF);
- new_flags |= old_flags & ~LRU_REFS_MASK;
+ new_flags = old_flags + BIT(LRU_REFS_PGOFF);
} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
}
-#else
-static void folio_inc_refs(struct folio *folio)
+
+static bool lru_gen_clear_refs(struct folio *folio)
+{
+ struct lru_gen_folio *lrugen;
+ int gen = folio_lru_gen(folio);
+ int type = folio_is_file_lru(folio);
+
+ if (gen < 0)
+ return true;
+
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0);
+
+ lrugen = &folio_lruvec(folio)->lrugen;
+ /* whether can do without shuffling under the LRU lock */
+ return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type]));
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static void lru_gen_inc_refs(struct folio *folio)
+{
+}
+
+static bool lru_gen_clear_refs(struct folio *folio)
{
+ return false;
}
+
#endif /* CONFIG_LRU_GEN */
/**
@@ -427,8 +448,10 @@ static void folio_inc_refs(struct folio *folio)
*/
void folio_mark_accessed(struct folio *folio)
{
+ if (folio_test_dropbehind(folio))
+ return;
if (lru_gen_enabled()) {
- folio_inc_refs(folio);
+ lru_gen_inc_refs(folio);
return;
}
@@ -474,7 +497,7 @@ void folio_add_lru(struct folio *folio)
folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- /* see the comment in lru_gen_add_folio() */
+ /* see the comment in lru_gen_folio_seq() */
if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
folio_set_active(folio);
@@ -524,7 +547,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
*/
static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
{
- bool active = folio_test_active(folio);
+ bool active = folio_test_active(folio) || lru_gen_enabled();
long nr_pages = folio_nr_pages(folio);
if (folio_test_unevictable(folio))
@@ -589,7 +612,10 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
lruvec_del_folio(lruvec, folio);
folio_clear_active(folio);
- folio_clear_referenced(folio);
+ if (lru_gen_enabled())
+ lru_gen_clear_refs(folio);
+ else
+ folio_clear_referenced(folio);
/*
* Lazyfree folios are clean anonymous folios. They have
* the swapbacked flag cleared, to distinguish them from normal
@@ -657,6 +683,9 @@ void deactivate_file_folio(struct folio *folio)
if (folio_test_unevictable(folio))
return;
+ if (lru_gen_enabled() && lru_gen_clear_refs(folio))
+ return;
+
folio_batch_add_and_move(folio, lru_deactivate_file, true);
}
@@ -670,7 +699,10 @@ void deactivate_file_folio(struct folio *folio)
*/
void folio_deactivate(struct folio *folio)
{
- if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled()))
+ if (folio_test_unevictable(folio))
+ return;
+
+ if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio))
return;
folio_batch_add_and_move(folio, lru_deactivate, true);
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index da1278f0563b..1007c30f12e2 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -6,149 +6,107 @@
#include <linux/swapops.h> /* depends on mm.h include */
static DEFINE_MUTEX(swap_cgroup_mutex);
-struct swap_cgroup_ctrl {
- struct page **map;
- unsigned long length;
- spinlock_t lock;
-};
-
-static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+/* Pack the cgroup ids (unsigned short) of two entries into one swap_cgroup (atomic_t) */
+#define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short))
+#define ID_SHIFT (BITS_PER_TYPE(unsigned short))
+#define ID_MASK (BIT(ID_SHIFT) - 1)
struct swap_cgroup {
- unsigned short id;
+ atomic_t ids;
};
-#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
-
-/*
- * SwapCgroup implements "lookup" and "exchange" operations.
- * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
- * against SwapCache. At swap_free(), this is accessed directly from swap.
- *
- * This means,
- * - we have no race in "exchange" when we're accessed via SwapCache because
- * SwapCache(and its swp_entry) is under lock.
- * - When called via swap_free(), there is no user of this entry and no race.
- * Then, we don't need lock around "exchange".
- *
- * TODO: we can push these buffers out to HIGHMEM.
- */
-
-/*
- * allocate buffer for swap_cgroup.
- */
-static int swap_cgroup_prepare(int type)
-{
- struct page *page;
- struct swap_cgroup_ctrl *ctrl;
- unsigned long idx, max;
-
- ctrl = &swap_cgroup_ctrl[type];
-
- for (idx = 0; idx < ctrl->length; idx++) {
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page)
- goto not_enough_page;
- ctrl->map[idx] = page;
- if (!(idx % SWAP_CLUSTER_MAX))
- cond_resched();
- }
- return 0;
-not_enough_page:
- max = idx;
- for (idx = 0; idx < max; idx++)
- __free_page(ctrl->map[idx]);
+struct swap_cgroup_ctrl {
+ struct swap_cgroup *map;
+};
- return -ENOMEM;
-}
+static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
-static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl,
- pgoff_t offset)
+static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map,
+ pgoff_t offset)
{
- struct page *mappage;
- struct swap_cgroup *sc;
+ unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT;
+ unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids);
+
+ BUILD_BUG_ON(!is_power_of_2(ID_PER_SC));
+ BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t));
- mappage = ctrl->map[offset / SC_PER_PAGE];
- sc = page_address(mappage);
- return sc + offset % SC_PER_PAGE;
+ return (old_ids >> shift) & ID_MASK;
}
-static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
- struct swap_cgroup_ctrl **ctrlp)
+static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map,
+ pgoff_t offset,
+ unsigned short new_id)
{
- pgoff_t offset = swp_offset(ent);
- struct swap_cgroup_ctrl *ctrl;
-
- ctrl = &swap_cgroup_ctrl[swp_type(ent)];
- if (ctrlp)
- *ctrlp = ctrl;
- return __lookup_swap_cgroup(ctrl, offset);
+ unsigned short old_id;
+ struct swap_cgroup *sc = &map[offset / ID_PER_SC];
+ unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT;
+ unsigned int new_ids, old_ids = atomic_read(&sc->ids);
+
+ do {
+ old_id = (old_ids >> shift) & ID_MASK;
+ new_ids = (old_ids & ~(ID_MASK << shift));
+ new_ids |= ((unsigned int)new_id) << shift;
+ } while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids));
+
+ return old_id;
}
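A worked example of the packing used above, assuming 16-bit cgroup ids and a 32-bit atomic_t (so ID_PER_SC == 2 and ID_SHIFT == 16):

/*
 * Illustration only:
 *   offset = 5
 *   word   = offset / ID_PER_SC              = 2
 *   shift  = (offset % ID_PER_SC) * ID_SHIFT = 16
 *   id     = (atomic_read(&map[2].ids) >> 16) & ID_MASK
 * i.e. entry 5's cgroup id lives in the upper half-word of map[2].
 */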
/**
- * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @ent: swap entry to be cmpxchged
- * @old: old id
- * @new: new id
+ * swap_cgroup_record - record mem_cgroup for a set of swap entries.
+ * These entries must all belong to a single folio that is being
+ * charged for swap space (swap out), and they must not have been
+ * charged yet.
*
- * Returns old id at success, 0 at failure.
- * (There is no mem_cgroup using 0 as its id)
+ * @folio: the folio that the swap entry belongs to
+ * @id: mem_cgroup ID to be recorded
+ * @ent: the first swap entry to be recorded
*/
-unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
- unsigned short old, unsigned short new)
+void swap_cgroup_record(struct folio *folio, unsigned short id,
+ swp_entry_t ent)
{
- struct swap_cgroup_ctrl *ctrl;
- struct swap_cgroup *sc;
- unsigned long flags;
- unsigned short retval;
-
- sc = lookup_swap_cgroup(ent, &ctrl);
-
- spin_lock_irqsave(&ctrl->lock, flags);
- retval = sc->id;
- if (retval == old)
- sc->id = new;
- else
- retval = 0;
- spin_unlock_irqrestore(&ctrl->lock, flags);
- return retval;
+ unsigned int nr_ents = folio_nr_pages(folio);
+ struct swap_cgroup *map;
+ pgoff_t offset, end;
+ unsigned short old;
+
+ offset = swp_offset(ent);
+ end = offset + nr_ents;
+ map = swap_cgroup_ctrl[swp_type(ent)].map;
+
+ do {
+ old = __swap_cgroup_id_xchg(map, offset, id);
+ VM_BUG_ON(old);
+ } while (++offset != end);
}
/**
- * swap_cgroup_record - record mem_cgroup for a set of swap entries
+ * swap_cgroup_clear - clear mem_cgroup for a set of swap entries.
+ * These entries must be being uncharged from swap. They either
+ * belongs to one single folio in the swap cache (swap in for
+ * cgroup v1), or no longer have any users (slot freeing).
+ *
* @ent: the first swap entry to be recorded into
- * @id: mem_cgroup to be recorded
* @nr_ents: number of swap entries to be recorded
*
- * Returns old value at success, 0 at failure.
- * (Of course, old value can be 0.)
+ * Returns the existing old value.
*/
-unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
- unsigned int nr_ents)
+unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents)
{
- struct swap_cgroup_ctrl *ctrl;
- struct swap_cgroup *sc;
- unsigned short old;
- unsigned long flags;
pgoff_t offset = swp_offset(ent);
pgoff_t end = offset + nr_ents;
+ struct swap_cgroup *map;
+ unsigned short old, iter = 0;
- sc = lookup_swap_cgroup(ent, &ctrl);
-
- spin_lock_irqsave(&ctrl->lock, flags);
- old = sc->id;
- for (;;) {
- VM_BUG_ON(sc->id != old);
- sc->id = id;
- offset++;
- if (offset == end)
- break;
- if (offset % SC_PER_PAGE)
- sc++;
- else
- sc = __lookup_swap_cgroup(ctrl, offset);
- }
- spin_unlock_irqrestore(&ctrl->lock, flags);
+ offset = swp_offset(ent);
+ end = offset + nr_ents;
+ map = swap_cgroup_ctrl[swp_type(ent)].map;
+
+ do {
+ old = __swap_cgroup_id_xchg(map, offset, 0);
+ if (!iter)
+ iter = old;
+ VM_BUG_ON(iter != old);
+ } while (++offset != end);
return old;
}
@@ -161,39 +119,33 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
*/
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
+ struct swap_cgroup_ctrl *ctrl;
+
if (mem_cgroup_disabled())
return 0;
- return lookup_swap_cgroup(ent, NULL)->id;
+
+ ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+ return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent));
}
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
- void *array;
- unsigned long length;
+ struct swap_cgroup *map;
struct swap_cgroup_ctrl *ctrl;
if (mem_cgroup_disabled())
return 0;
- length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
-
- array = vcalloc(length, sizeof(void *));
- if (!array)
+ BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC !=
+ sizeof(struct swap_cgroup));
+ map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) *
+ sizeof(struct swap_cgroup));
+ if (!map)
goto nomem;
ctrl = &swap_cgroup_ctrl[type];
mutex_lock(&swap_cgroup_mutex);
- ctrl->length = length;
- ctrl->map = array;
- spin_lock_init(&ctrl->lock);
- if (swap_cgroup_prepare(type)) {
- /* memory shortage */
- ctrl->map = NULL;
- ctrl->length = 0;
- mutex_unlock(&swap_cgroup_mutex);
- vfree(array);
- goto nomem;
- }
+ ctrl->map = map;
mutex_unlock(&swap_cgroup_mutex);
return 0;
@@ -205,8 +157,7 @@ nomem:
void swap_cgroup_swapoff(int type)
{
- struct page **map;
- unsigned long i, length;
+ struct swap_cgroup *map;
struct swap_cgroup_ctrl *ctrl;
if (mem_cgroup_disabled())
@@ -215,19 +166,8 @@ void swap_cgroup_swapoff(int type)
mutex_lock(&swap_cgroup_mutex);
ctrl = &swap_cgroup_ctrl[type];
map = ctrl->map;
- length = ctrl->length;
ctrl->map = NULL;
- ctrl->length = 0;
mutex_unlock(&swap_cgroup_mutex);
- if (map) {
- for (i = 0; i < length; i++) {
- struct page *page = map[i];
- if (page)
- __free_page(page);
- if (!(i % SWAP_CLUSTER_MAX))
- cond_resched();
- }
- vfree(map);
- }
+ vfree(map);
}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 13ab3b771409..9c7c171df7ba 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -43,17 +43,15 @@ static DEFINE_MUTEX(swap_slots_cache_mutex);
/* Serialize swap slots cache enable/disable operations */
static DEFINE_MUTEX(swap_slots_cache_enable_mutex);
-static void __drain_swap_slots_cache(unsigned int type);
+static void __drain_swap_slots_cache(void);
#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
-#define SLOTS_CACHE 0x1
-#define SLOTS_CACHE_RET 0x2
static void deactivate_swap_slots_cache(void)
{
mutex_lock(&swap_slots_cache_mutex);
swap_slot_cache_active = false;
- __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
+ __drain_swap_slots_cache();
mutex_unlock(&swap_slots_cache_mutex);
}
@@ -72,7 +70,7 @@ void disable_swap_slots_cache_lock(void)
if (swap_slot_cache_initialized) {
/* serialize with cpu hotplug operations */
cpus_read_lock();
- __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET);
+ __drain_swap_slots_cache();
cpus_read_unlock();
}
}
@@ -113,7 +111,7 @@ out:
static int alloc_swap_slot_cache(unsigned int cpu)
{
struct swap_slots_cache *cache;
- swp_entry_t *slots, *slots_ret;
+ swp_entry_t *slots;
/*
* Do allocation outside swap_slots_cache_mutex
@@ -125,28 +123,19 @@ static int alloc_swap_slot_cache(unsigned int cpu)
if (!slots)
return -ENOMEM;
- slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
- GFP_KERNEL);
- if (!slots_ret) {
- kvfree(slots);
- return -ENOMEM;
- }
-
mutex_lock(&swap_slots_cache_mutex);
cache = &per_cpu(swp_slots, cpu);
- if (cache->slots || cache->slots_ret) {
+ if (cache->slots) {
/* cache already allocated */
mutex_unlock(&swap_slots_cache_mutex);
kvfree(slots);
- kvfree(slots_ret);
return 0;
}
if (!cache->lock_initialized) {
mutex_init(&cache->alloc_lock);
- spin_lock_init(&cache->free_lock);
cache->lock_initialized = true;
}
cache->nr = 0;
@@ -160,19 +149,16 @@ static int alloc_swap_slot_cache(unsigned int cpu)
*/
mb();
cache->slots = slots;
- cache->slots_ret = slots_ret;
mutex_unlock(&swap_slots_cache_mutex);
return 0;
}
-static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
- bool free_slots)
+static void drain_slots_cache_cpu(unsigned int cpu, bool free_slots)
{
struct swap_slots_cache *cache;
- swp_entry_t *slots = NULL;
cache = &per_cpu(swp_slots, cpu);
- if ((type & SLOTS_CACHE) && cache->slots) {
+ if (cache->slots) {
mutex_lock(&cache->alloc_lock);
swapcache_free_entries(cache->slots + cache->cur, cache->nr);
cache->cur = 0;
@@ -183,20 +169,9 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
}
mutex_unlock(&cache->alloc_lock);
}
- if ((type & SLOTS_CACHE_RET) && cache->slots_ret) {
- spin_lock_irq(&cache->free_lock);
- swapcache_free_entries(cache->slots_ret, cache->n_ret);
- cache->n_ret = 0;
- if (free_slots && cache->slots_ret) {
- slots = cache->slots_ret;
- cache->slots_ret = NULL;
- }
- spin_unlock_irq(&cache->free_lock);
- kvfree(slots);
- }
}
-static void __drain_swap_slots_cache(unsigned int type)
+static void __drain_swap_slots_cache(void)
{
unsigned int cpu;
@@ -224,13 +199,13 @@ static void __drain_swap_slots_cache(unsigned int type)
* There are no slots on such cpu that need to be drained.
*/
for_each_online_cpu(cpu)
- drain_slots_cache_cpu(cpu, type, false);
+ drain_slots_cache_cpu(cpu, false);
}
static int free_slot_cache(unsigned int cpu)
{
mutex_lock(&swap_slots_cache_mutex);
- drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true);
+ drain_slots_cache_cpu(cpu, true);
mutex_unlock(&swap_slots_cache_mutex);
return 0;
}
@@ -269,39 +244,6 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
return cache->nr;
}
-void free_swap_slot(swp_entry_t entry)
-{
- struct swap_slots_cache *cache;
-
- /* Large folio swap slot is not covered. */
- zswap_invalidate(entry);
-
- cache = raw_cpu_ptr(&swp_slots);
- if (likely(use_swap_slot_cache && cache->slots_ret)) {
- spin_lock_irq(&cache->free_lock);
- /* Swap slots cache may be deactivated before acquiring lock */
- if (!use_swap_slot_cache || !cache->slots_ret) {
- spin_unlock_irq(&cache->free_lock);
- goto direct_free;
- }
- if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) {
- /*
- * Return slots to global pool.
- * The current swap_map value is SWAP_HAS_CACHE.
- * Set it to 0 to indicate it is available for
- * allocation in global pool
- */
- swapcache_free_entries(cache->slots_ret, cache->n_ret);
- cache->n_ret = 0;
- }
- cache->slots_ret[cache->n_ret++] = entry;
- spin_unlock_irq(&cache->free_lock);
- } else {
-direct_free:
- swapcache_free_entries(&entry, 1);
- }
-}
-
swp_entry_t folio_alloc_swap(struct folio *folio)
{
swp_entry_t entry;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e0c0321b8ff7..ca42b2be64d9 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
struct folio_batch folios;
unsigned int refs[PAGEVEC_SIZE];
- lru_add_drain();
folio_batch_init(&folios);
for (int i = 0; i < nr; i++) {
struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b0a9071cfe1d..df7c4e8b089c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,15 +53,15 @@
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
- unsigned int nr_pages);
-static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
+static void swap_entry_range_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages);
+static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
static bool folio_swapcache_freeable(struct folio *folio);
-static struct swap_cluster_info *lock_cluster_or_swap_info(
- struct swap_info_struct *si, unsigned long offset);
-static void unlock_cluster_or_swap_info(struct swap_info_struct *si,
- struct swap_cluster_info *ci);
+static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
+ unsigned long offset);
+static inline void unlock_cluster(struct swap_cluster_info *ci);
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
@@ -129,6 +129,26 @@ static inline unsigned char swap_count(unsigned char ent)
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
}
+/*
+ * Use the second-highest bit of the inuse_pages counter to indicate
+ * whether a swap device is on the available plist, so the atomic can
+ * still be updated arithmetically while carrying this embedded flag.
+ *
+ * The inuse_pages counter is the only thing indicating whether a device
+ * should be on avail_lists or not (except during swapon / swapoff). By
+ * embedding the off-list bit in the atomic counter, updates no longer
+ * need any lock to check the list status.
+ *
+ * This bit is set if the device is not on the plist and not usable,
+ * and cleared if the device is on the plist.
+ */
+#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
+#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
+static long swap_usage_in_pages(struct swap_info_struct *si)
+{
+ return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
+}
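As a hedged companion sketch (not part of the patch), the off-list state can be read from the same atomic without taking any lock:

/* Sketch only: true if the device is currently off the avail_lists plist. */
static inline bool swap_usage_is_offlist(struct swap_info_struct *si)
{
	return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT;
}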
+
/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY 0x1
/*
@@ -222,9 +242,9 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
* swap_map is HAS_CACHE only, which means the slots have no page table
* reference or pending writeback, and can't be allocated to others.
*/
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
need_reclaim = swap_is_has_cache(si, offset, nr_pages);
- unlock_cluster_or_swap_info(si, ci);
+ unlock_cluster(ci);
if (!need_reclaim)
goto out_unlock;
@@ -242,12 +262,9 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
folio_ref_sub(folio, nr_pages);
folio_set_dirty(folio);
- spin_lock(&si->lock);
- /* Only sinple page folio can be backed by zswap */
- if (nr_pages == 1)
- zswap_invalidate(entry);
- swap_entry_range_free(si, entry, nr_pages);
- spin_unlock(&si->lock);
+ ci = lock_cluster(si, offset);
+ swap_entry_range_free(si, ci, entry, nr_pages);
+ unlock_cluster(ci);
ret = nr_pages;
out_unlock:
folio_unlock(folio);
@@ -382,9 +399,23 @@ static void discard_swap_cluster(struct swap_info_struct *si,
#endif
#define LATENCY_LIMIT 256
-static inline bool cluster_is_free(struct swap_cluster_info *info)
+static inline bool cluster_is_empty(struct swap_cluster_info *info)
{
- return info->flags & CLUSTER_FLAG_FREE;
+ return info->count == 0;
+}
+
+static inline bool cluster_is_discard(struct swap_cluster_info *info)
+{
+ return info->flags == CLUSTER_FLAG_DISCARD;
+}
+
+static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
+{
+ if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
+ return false;
+ if (!order)
+ return true;
+ return cluster_is_empty(ci) || order == ci->order;
}
static inline unsigned int cluster_index(struct swap_info_struct *si,
@@ -393,6 +424,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si,
return ci - si->cluster_info;
}
+static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si,
+ unsigned long offset)
+{
+ return &si->cluster_info[offset / SWAPFILE_CLUSTER];
+}
+
static inline unsigned int cluster_offset(struct swap_info_struct *si,
struct swap_cluster_info *ci)
{
@@ -404,45 +441,38 @@ static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si
{
struct swap_cluster_info *ci;
- ci = si->cluster_info;
- if (ci) {
- ci += offset / SWAPFILE_CLUSTER;
- spin_lock(&ci->lock);
- }
+ ci = offset_to_cluster(si, offset);
+ spin_lock(&ci->lock);
+
return ci;
}
static inline void unlock_cluster(struct swap_cluster_info *ci)
{
- if (ci)
- spin_unlock(&ci->lock);
+ spin_unlock(&ci->lock);
}
-/*
- * Determine the locking method in use for this device. Return
- * swap_cluster_info if SSD-style cluster-based locking is in place.
- */
-static inline struct swap_cluster_info *lock_cluster_or_swap_info(
- struct swap_info_struct *si, unsigned long offset)
+static void move_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci, struct list_head *list,
+ enum swap_cluster_flags new_flags)
{
- struct swap_cluster_info *ci;
+ VM_WARN_ON(ci->flags == new_flags);
- /* Try to use fine-grained SSD-style locking if available: */
- ci = lock_cluster(si, offset);
- /* Otherwise, fall back to traditional, coarse locking: */
- if (!ci)
- spin_lock(&si->lock);
-
- return ci;
-}
+ BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX);
+ lockdep_assert_held(&ci->lock);
-static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
- struct swap_cluster_info *ci)
-{
- if (ci)
- unlock_cluster(ci);
+ spin_lock(&si->lock);
+ if (ci->flags == CLUSTER_FLAG_NONE)
+ list_add_tail(&ci->list, list);
else
- spin_unlock(&si->lock);
+ list_move_tail(&ci->list, list);
+ spin_unlock(&si->lock);
+
+ if (ci->flags == CLUSTER_FLAG_FRAG)
+ atomic_long_dec(&si->frag_cluster_nr[ci->order]);
+ else if (new_flags == CLUSTER_FLAG_FRAG)
+ atomic_long_inc(&si->frag_cluster_nr[ci->order]);
+ ci->flags = new_flags;
}
/* Add a cluster to discard list and schedule it to do discard */
@@ -458,51 +488,98 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
*/
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
SWAP_MAP_BAD, SWAPFILE_CLUSTER);
-
- VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
- list_move_tail(&ci->list, &si->discard_clusters);
- ci->flags = 0;
+ VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
+ move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD);
schedule_work(&si->discard_work);
}
static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
- lockdep_assert_held(&si->lock);
lockdep_assert_held(&ci->lock);
-
- if (ci->flags)
- list_move_tail(&ci->list, &si->free_clusters);
- else
- list_add_tail(&ci->list, &si->free_clusters);
- ci->flags = CLUSTER_FLAG_FREE;
+ move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
ci->order = 0;
}
/*
+ * Isolate and lock the first cluster on the list that is not contended,
+ * clearing its flag as it is taken off-list. The cluster flag must stay
+ * in sync with the list status, so cluster updaters can always know a
+ * cluster's list status without touching the si lock.
+ *
+ * Note it's possible that all clusters on a list are contended, so
+ * this may return NULL for a non-empty list.
+ */
+static struct swap_cluster_info *isolate_lock_cluster(
+ struct swap_info_struct *si, struct list_head *list)
+{
+ struct swap_cluster_info *ci, *ret = NULL;
+
+ spin_lock(&si->lock);
+
+ if (unlikely(!(si->flags & SWP_WRITEOK)))
+ goto out;
+
+ list_for_each_entry(ci, list, list) {
+ if (!spin_trylock(&ci->lock))
+ continue;
+
+		/* We may only isolate and clear the flags of clusters on the following lists */
+ VM_BUG_ON(!ci->flags);
+ VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
+ ci->flags != CLUSTER_FLAG_FULL);
+
+ list_del(&ci->list);
+ ci->flags = CLUSTER_FLAG_NONE;
+ ret = ci;
+ break;
+ }
+out:
+ spin_unlock(&si->lock);
+
+ return ret;
+}
+
+/*
* Doing discard actually. After a cluster discard is finished, the cluster
- * will be added to free cluster list. caller should hold si->lock.
-*/
-static void swap_do_scheduled_discard(struct swap_info_struct *si)
+ * will be added to the free cluster list. Discard clusters are a bit special:
+ * they don't participate in allocation or reclaim, so clusters marked as
+ * CLUSTER_FLAG_DISCARD must remain off-list or on the discard list.
+ */
+static bool swap_do_scheduled_discard(struct swap_info_struct *si)
{
struct swap_cluster_info *ci;
+ bool ret = false;
unsigned int idx;
+ spin_lock(&si->lock);
while (!list_empty(&si->discard_clusters)) {
ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
+ /*
+		 * Delete the cluster from the list to prepare for discard, but keep
+		 * the CLUSTER_FLAG_DISCARD flag: a percpu_cluster could still be
+		 * pointing to it, or it could be run into by relocate_cluster.
+ */
list_del(&ci->list);
idx = cluster_index(si, ci);
spin_unlock(&si->lock);
-
discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
SWAPFILE_CLUSTER);
- spin_lock(&si->lock);
spin_lock(&ci->lock);
- __free_cluster(si, ci);
+ /*
+ * Discard is done, clear its flags as it's off-list, then
+ * return the cluster to allocation list.
+ */
+ ci->flags = CLUSTER_FLAG_NONE;
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
0, SWAPFILE_CLUSTER);
+ __free_cluster(si, ci);
spin_unlock(&ci->lock);
+ ret = true;
+ spin_lock(&si->lock);
}
+ spin_unlock(&si->lock);
+ return ret;
}
static void swap_discard_work(struct work_struct *work)
@@ -511,9 +588,7 @@ static void swap_discard_work(struct work_struct *work)
si = container_of(work, struct swap_info_struct, discard_work);
- spin_lock(&si->lock);
swap_do_scheduled_discard(si);
- spin_unlock(&si->lock);
}
static void swap_users_ref_free(struct percpu_ref *ref)
@@ -524,15 +599,16 @@ static void swap_users_ref_free(struct percpu_ref *ref)
complete(&si->comp);
}
+/*
+ * Must be called after freeing if ci->count == 0, moves the cluster to free
+ * or discard list.
+ */
static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
VM_BUG_ON(ci->count != 0);
- lockdep_assert_held(&si->lock);
+ VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
lockdep_assert_held(&ci->lock);
- if (ci->flags & CLUSTER_FLAG_FRAG)
- si->frag_cluster_nr[ci->order]--;
-
/*
* If the swap is discardable, prepare discard the cluster
* instead of free it immediately. The cluster will be freed
@@ -548,6 +624,49 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
}
/*
+ * Must be called after freeing if ci->count != 0, moves the cluster to
+ * nonfull list.
+ */
+static void partial_free_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
+{
+ VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER);
+ lockdep_assert_held(&ci->lock);
+
+ if (ci->flags != CLUSTER_FLAG_NONFULL)
+ move_cluster(si, ci, &si->nonfull_clusters[ci->order],
+ CLUSTER_FLAG_NONFULL);
+}
+
+/*
+ * Must be called after allocation, moves the cluster to full or frag list.
+ * Note: allocation doesn't acquire si lock, and may drop the ci lock for
+ * reclaim, so the cluster could be anywhere when called.
+ */
+static void relocate_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
+{
+ lockdep_assert_held(&ci->lock);
+
+ /* Discard cluster must remain off-list or on discard list */
+ if (cluster_is_discard(ci))
+ return;
+
+ if (!ci->count) {
+ if (ci->flags != CLUSTER_FLAG_FREE)
+ free_cluster(si, ci);
+ } else if (ci->count != SWAPFILE_CLUSTER) {
+ if (ci->flags != CLUSTER_FLAG_FRAG)
+ move_cluster(si, ci, &si->frag_clusters[ci->order],
+ CLUSTER_FLAG_FRAG);
+ } else {
+ if (ci->flags != CLUSTER_FLAG_FULL)
+ move_cluster(si, ci, &si->full_clusters,
+ CLUSTER_FLAG_FULL);
+ }
+}
+
+/*
* The cluster corresponding to page_nr will be used. The cluster will not be
* added to free cluster list and its usage counter will be increased by 1.
* Only used for initialization.
@@ -558,9 +677,6 @@ static void inc_cluster_info_page(struct swap_info_struct *si,
unsigned long idx = page_nr / SWAPFILE_CLUSTER;
struct swap_cluster_info *ci;
- if (!cluster_info)
- return;
-
ci = cluster_info + idx;
ci->count++;
@@ -568,63 +684,33 @@ static void inc_cluster_info_page(struct swap_info_struct *si,
VM_BUG_ON(ci->flags);
}
-/*
- * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0,
- * which means no page in the cluster is in use, we can optionally discard
- * the cluster and add it to free cluster list.
- */
-static void dec_cluster_info_page(struct swap_info_struct *si,
- struct swap_cluster_info *ci, int nr_pages)
-{
- if (!si->cluster_info)
- return;
-
- VM_BUG_ON(ci->count < nr_pages);
- VM_BUG_ON(cluster_is_free(ci));
- lockdep_assert_held(&si->lock);
- lockdep_assert_held(&ci->lock);
- ci->count -= nr_pages;
-
- if (!ci->count) {
- free_cluster(si, ci);
- return;
- }
-
- if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
- VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
- if (ci->flags & CLUSTER_FLAG_FRAG)
- si->frag_cluster_nr[ci->order]--;
- list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]);
- ci->flags = CLUSTER_FLAG_NONFULL;
- }
-}
-
static bool cluster_reclaim_range(struct swap_info_struct *si,
struct swap_cluster_info *ci,
unsigned long start, unsigned long end)
{
unsigned char *map = si->swap_map;
- unsigned long offset;
+ unsigned long offset = start;
+ int nr_reclaim;
spin_unlock(&ci->lock);
- spin_unlock(&si->lock);
-
- for (offset = start; offset < end; offset++) {
+ do {
switch (READ_ONCE(map[offset])) {
case 0:
- continue;
+ offset++;
+ break;
case SWAP_HAS_CACHE:
- if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0)
- continue;
- goto out;
+ nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT);
+ if (nr_reclaim > 0)
+ offset += nr_reclaim;
+ else
+ goto out;
+ break;
default:
goto out;
}
- }
+ } while (offset < end);
out:
- spin_lock(&si->lock);
spin_lock(&ci->lock);
-
/*
* Recheck the range no matter reclaim succeeded or not, the slot
	 * could have been freed while we are not holding the lock.
@@ -638,11 +724,11 @@ out:
static bool cluster_scan_range(struct swap_info_struct *si,
struct swap_cluster_info *ci,
- unsigned long start, unsigned int nr_pages)
+ unsigned long start, unsigned int nr_pages,
+ bool *need_reclaim)
{
unsigned long offset, end = start + nr_pages;
unsigned char *map = si->swap_map;
- bool need_reclaim = false;
for (offset = start; offset < end; offset++) {
switch (READ_ONCE(map[offset])) {
@@ -651,16 +737,13 @@ static bool cluster_scan_range(struct swap_info_struct *si,
case SWAP_HAS_CACHE:
if (!vm_swap_full())
return false;
- need_reclaim = true;
+ *need_reclaim = true;
continue;
default:
return false;
}
}
- if (need_reclaim)
- return cluster_reclaim_range(si, ci, start, end);
-
return true;
}
@@ -670,73 +753,79 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
{
unsigned int nr_pages = 1 << order;
+ lockdep_assert_held(&ci->lock);
+
if (!(si->flags & SWP_WRITEOK))
return false;
- if (cluster_is_free(ci)) {
- if (nr_pages < SWAPFILE_CLUSTER) {
- list_move_tail(&ci->list, &si->nonfull_clusters[order]);
- ci->flags = CLUSTER_FLAG_NONFULL;
- }
+ /*
+ * The first allocation in a cluster makes the
+ * cluster exclusive to this order
+ */
+ if (cluster_is_empty(ci))
ci->order = order;
- }
memset(si->swap_map + start, usage, nr_pages);
- swap_range_alloc(si, start, nr_pages);
+ swap_range_alloc(si, nr_pages);
ci->count += nr_pages;
- if (ci->count == SWAPFILE_CLUSTER) {
- VM_BUG_ON(!(ci->flags &
- (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG)));
- if (ci->flags & CLUSTER_FLAG_FRAG)
- si->frag_cluster_nr[ci->order]--;
- list_move_tail(&ci->list, &si->full_clusters);
- ci->flags = CLUSTER_FLAG_FULL;
- }
-
return true;
}
-static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset,
- unsigned int *foundp, unsigned int order,
+/* Try to use a new cluster for the current CPU and allocate from it. */
+static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long offset,
+ unsigned int order,
unsigned char usage)
{
- unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1);
+ unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
+ unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
unsigned int nr_pages = 1 << order;
- struct swap_cluster_info *ci;
+ bool need_reclaim, ret;
- if (end < nr_pages)
- return SWAP_NEXT_INVALID;
- end -= nr_pages;
+ lockdep_assert_held(&ci->lock);
- ci = lock_cluster(si, offset);
- if (ci->count + nr_pages > SWAPFILE_CLUSTER) {
- offset = SWAP_NEXT_INVALID;
- goto done;
- }
+ if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER)
+ goto out;
- while (offset <= end) {
- if (cluster_scan_range(si, ci, offset, nr_pages)) {
- if (!cluster_alloc_range(si, ci, offset, usage, order)) {
- offset = SWAP_NEXT_INVALID;
- goto done;
- }
- *foundp = offset;
- if (ci->count == SWAPFILE_CLUSTER) {
- offset = SWAP_NEXT_INVALID;
- goto done;
- }
- offset += nr_pages;
- break;
+ for (end -= nr_pages; offset <= end; offset += nr_pages) {
+ need_reclaim = false;
+ if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim))
+ continue;
+ if (need_reclaim) {
+ ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages);
+ /*
+			 * Reclaim drops ci->lock and the cluster could be
+			 * used by another order. We don't check the flags
+			 * here, as an off-list cluster has no flag set and a
+			 * change of list won't cause fragmentation.
+ */
+ if (!cluster_is_usable(ci, order))
+ goto out;
+ if (cluster_is_empty(ci))
+ offset = start;
+ /* Reclaim failed but cluster is usable, try next */
+ if (!ret)
+ continue;
}
+ if (!cluster_alloc_range(si, ci, offset, usage, order))
+ break;
+ found = offset;
offset += nr_pages;
+ if (ci->count < SWAPFILE_CLUSTER && offset <= end)
+ next = offset;
+ break;
}
- if (offset > end)
- offset = SWAP_NEXT_INVALID;
-done:
+out:
+ relocate_cluster(si, ci);
unlock_cluster(ci);
- return offset;
+ if (si->flags & SWP_SOLIDSTATE)
+ __this_cpu_write(si->percpu_cluster->next[order], next);
+ else
+ si->global_cluster->next[order] = next;
+ return found;
}
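/*
 * Worked example for the scan loop above (illustrative numbers only;
 * SWAPFILE_CLUSTER is assumed to be 512 here, as on x86-64 with THP swap):
 *
 *	order  = 2, so nr_pages = 4
 *	offset = 1036 (the cached or caller-supplied starting slot)
 *	start  = ALIGN_DOWN(1036, 512) = 1024
 *	end    = min(1024 + 512, si->max) - 4 = 1532 on a large device
 *
 * The loop probes offsets 1036, 1040, 1044, ... while offset <= end. After
 * a successful allocation, the slot right after it is cached in
 * percpu_cluster->next[order] (SSD) or global_cluster->next[order] (HDD),
 * provided the cluster still has room, so the next request of the same
 * order can resume from there.
 */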
/* Return true if reclaimed a whole cluster */
@@ -749,20 +838,19 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
int nr_reclaim;
if (force)
- to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
+ to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
- while (!list_empty(&si->full_clusters)) {
- ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
- list_move_tail(&ci->list, &si->full_clusters);
+ while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
offset = cluster_offset(si, ci);
end = min(si->max, offset + SWAPFILE_CLUSTER);
to_scan--;
- spin_unlock(&si->lock);
while (offset < end) {
if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) {
+ spin_unlock(&ci->lock);
nr_reclaim = __try_to_reclaim_swap(si, offset,
TTRS_ANYWAY | TTRS_DIRECT);
+ spin_lock(&ci->lock);
if (nr_reclaim) {
offset += abs(nr_reclaim);
continue;
@@ -770,8 +858,12 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
}
offset++;
}
- spin_lock(&si->lock);
+ /* in case no swap cache is reclaimed */
+ if (ci->flags == CLUSTER_FLAG_NONE)
+ relocate_cluster(si, ci);
+
+ unlock_cluster(ci);
if (to_scan <= 0)
break;
}
@@ -783,9 +875,7 @@ static void swap_reclaim_work(struct work_struct *work)
si = container_of(work, struct swap_info_struct, reclaim_work);
- spin_lock(&si->lock);
swap_reclaim_full_clusters(si, true);
- spin_unlock(&si->lock);
}
/*
@@ -796,29 +886,41 @@ static void swap_reclaim_work(struct work_struct *work)
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
unsigned char usage)
{
- struct percpu_cluster *cluster;
struct swap_cluster_info *ci;
unsigned int offset, found = 0;
-new_cluster:
- lockdep_assert_held(&si->lock);
- cluster = this_cpu_ptr(si->percpu_cluster);
- offset = cluster->next[order];
+ if (si->flags & SWP_SOLIDSTATE) {
+ /* Fast path using per CPU cluster */
+ local_lock(&si->percpu_cluster->lock);
+ offset = __this_cpu_read(si->percpu_cluster->next[order]);
+ } else {
+ /* Serialize HDD SWAP allocation for each device. */
+ spin_lock(&si->global_cluster_lock);
+ offset = si->global_cluster->next[order];
+ }
+
if (offset) {
- offset = alloc_swap_scan_cluster(si, offset, &found, order, usage);
+ ci = lock_cluster(si, offset);
+ /* Cluster could have been used by another order */
+ if (cluster_is_usable(ci, order)) {
+ if (cluster_is_empty(ci))
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset,
+ order, usage);
+ } else {
+ unlock_cluster(ci);
+ }
if (found)
goto done;
}
- if (!list_empty(&si->free_clusters)) {
- ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
- /*
- * Either we didn't touch the cluster due to swapoff,
- * or the allocation must success.
- */
- VM_BUG_ON((si->flags & SWP_WRITEOK) && !found);
- goto done;
+new_cluster:
+ ci = isolate_lock_cluster(si, &si->free_clusters);
+ if (ci) {
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ order, usage);
+ if (found)
+ goto done;
}
/* Try reclaim from full clusters if free clusters list is drained */
@@ -826,56 +928,42 @@ new_cluster:
swap_reclaim_full_clusters(si, false);
if (order < PMD_ORDER) {
- unsigned int frags = 0;
-
- while (!list_empty(&si->nonfull_clusters[order])) {
- ci = list_first_entry(&si->nonfull_clusters[order],
- struct swap_cluster_info, list);
- list_move_tail(&ci->list, &si->frag_clusters[order]);
- ci->flags = CLUSTER_FLAG_FRAG;
- si->frag_cluster_nr[order]++;
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
- &found, order, usage);
- frags++;
+ unsigned int frags = 0, frags_existing;
+
+ while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ order, usage);
if (found)
- break;
+ goto done;
+			/* Clusters that failed to allocate are moved to frag_clusters */
+ frags++;
}
- if (!found) {
+ frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
+ while (frags < frags_existing &&
+ (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) {
+ atomic_long_dec(&si->frag_cluster_nr[order]);
/*
- * Nonfull clusters are moved to frag tail if we reached
- * here, count them too, don't over scan the frag list.
+			 * Rotate the frag list to iterate; these clusters
+			 * either failed a high order allocation or were moved
+			 * here due to per-CPU usage, but they could contain
+			 * newly released reclaimable (e.g. lazy-freed swap
+			 * cache) slots.
*/
- while (frags < si->frag_cluster_nr[order]) {
- ci = list_first_entry(&si->frag_clusters[order],
- struct swap_cluster_info, list);
- /*
- * Rotate the frag list to iterate, they were all failing
- * high order allocation or moved here due to per-CPU usage,
- * this help keeping usable cluster ahead.
- */
- list_move_tail(&ci->list, &si->frag_clusters[order]);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
- &found, order, usage);
- frags++;
- if (found)
- break;
- }
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ order, usage);
+ if (found)
+ goto done;
+ frags++;
}
}
- if (found)
- goto done;
-
- if (!list_empty(&si->discard_clusters)) {
- /*
- * we don't have free cluster but have some clusters in
- * discarding, do discard now and reclaim them, then
- * reread cluster_next_cpu since we dropped si->lock
- */
- swap_do_scheduled_discard(si);
+ /*
+	 * We don't have a free cluster, but some clusters are being
+	 * discarded; do the discard now and reclaim them, then restart
+	 * scanning from a new cluster.
+ */
+ if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
goto new_cluster;
- }
if (order)
goto done;
@@ -886,74 +974,151 @@ new_cluster:
* Clusters here have at least one usable slots and can't fail order 0
* allocation, but reclaim may drop si->lock and race with another user.
*/
- while (!list_empty(&si->frag_clusters[o])) {
- ci = list_first_entry(&si->frag_clusters[o],
- struct swap_cluster_info, list);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
- &found, 0, usage);
+ while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
+ atomic_long_dec(&si->frag_cluster_nr[o]);
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ 0, usage);
if (found)
goto done;
}
- while (!list_empty(&si->nonfull_clusters[o])) {
- ci = list_first_entry(&si->nonfull_clusters[o],
- struct swap_cluster_info, list);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
- &found, 0, usage);
+ while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
+ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
+ 0, usage);
if (found)
goto done;
}
}
-
done:
- cluster->next[order] = offset;
+ if (si->flags & SWP_SOLIDSTATE)
+ local_unlock(&si->percpu_cluster->lock);
+ else
+ spin_unlock(&si->global_cluster_lock);
return found;
}
-static void __del_from_avail_list(struct swap_info_struct *si)
+/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
+static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
{
int nid;
+ unsigned long pages;
+
+ spin_lock(&swap_avail_lock);
+
+ if (swapoff) {
+ /*
+		 * Forcefully remove it. Clear the SWP_WRITEOK flag for
+		 * swapoff here so the clearing is synchronized by both
+		 * si->lock and swap_avail_lock, ensuring the result can
+		 * be seen by add_to_avail_list.
+ */
+ lockdep_assert_held(&si->lock);
+ si->flags &= ~SWP_WRITEOK;
+ atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
+ } else {
+ /*
+		 * If not called by swapoff, take it off-list only if it's
+		 * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly,
+		 * si->inuse_pages == pages). Any concurrent slot freeing,
+		 * or the device having already been removed from the plist
+		 * by someone else, will make the cmpxchg fail and we skip.
+ */
+ pages = si->pages;
+ if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
+ pages | SWAP_USAGE_OFFLIST_BIT))
+ goto skip;
+ }
- assert_spin_locked(&si->lock);
for_each_node(nid)
plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+
+skip:
+ spin_unlock(&swap_avail_lock);
}
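/*
 * Illustrative sketch of the off-list transition above (the helper name is
 * hypothetical, not part of the patch): exactly one updater can observe the
 * device completely full with SWAP_USAGE_OFFLIST_BIT still clear. Any
 * concurrent slot free, or a competing remover, changes si->inuse_pages and
 * makes the cmpxchg fail, so the call degrades to a harmless no-op.
 */
static bool try_mark_offlist(struct swap_info_struct *si)
{
	long full = si->pages;	/* expected: device full and still on-list */

	return atomic_long_try_cmpxchg(&si->inuse_pages, &full,
				       full | SWAP_USAGE_OFFLIST_BIT);
}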
-static void del_from_avail_list(struct swap_info_struct *si)
+/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
+static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
{
+ int nid;
+ long val;
+ unsigned long pages;
+
spin_lock(&swap_avail_lock);
- __del_from_avail_list(si);
+
+ /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
+ if (swapon) {
+ lockdep_assert_held(&si->lock);
+ si->flags |= SWP_WRITEOK;
+ } else {
+ if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
+ goto skip;
+ }
+
+ if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
+ goto skip;
+
+ val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
+
+ /*
+	 * When the device is full and on the plist, only one updater will
+	 * see (inuse_pages == si->pages) and call del_from_avail_list. If
+	 * that updater happens to be here, just skip adding.
+ */
+ pages = si->pages;
+ if (val == pages) {
+ /* Just like the cmpxchg in del_from_avail_list */
+ if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
+ pages | SWAP_USAGE_OFFLIST_BIT))
+ goto skip;
+ }
+
+ for_each_node(nid)
+ plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+
+skip:
spin_unlock(&swap_avail_lock);
}
-static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
- unsigned int nr_entries)
+/*
+ * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
+ * within each cluster, so the total contribution to the global counter should
+ * always be positive and cannot exceed the total number of usable slots.
+ */
+static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
{
- unsigned int end = offset + nr_entries - 1;
+ long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
- if (offset == si->lowest_bit)
- si->lowest_bit += nr_entries;
- if (end == si->highest_bit)
- WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
- WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
- if (si->inuse_pages == si->pages) {
- si->lowest_bit = si->max;
- si->highest_bit = 0;
- del_from_avail_list(si);
-
- if (si->cluster_info && vm_swap_full())
- schedule_work(&si->reclaim_work);
+ /*
+ * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
+ * remove it from the plist.
+ */
+ if (unlikely(val == si->pages)) {
+ del_from_avail_list(si, false);
+ return true;
}
+
+ return false;
}
-static void add_to_avail_list(struct swap_info_struct *si)
+static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
{
- int nid;
+ long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
- spin_lock(&swap_avail_lock);
- for_each_node(nid)
- plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
- spin_unlock(&swap_avail_lock);
+ /*
+ * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
+ * remove it from the plist.
+ */
+ if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
+ add_to_avail_list(si, false);
+}
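/*
 * How the combined counter is presumably read back elsewhere in this series
 * (swap_usage_in_pages() itself is not visible in this hunk, so treat the
 * body below as an assumption): the low part carries the in-use slot count,
 * while SWAP_USAGE_OFFLIST_BIT on top records whether the device has been
 * taken off the avail plists.
 */
static long swap_usage_in_pages_sketch(struct swap_info_struct *si)
{
	return atomic_long_read(&si->inuse_pages) & ~SWAP_USAGE_OFFLIST_BIT;
}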
+
+static void swap_range_alloc(struct swap_info_struct *si,
+ unsigned int nr_entries)
+{
+ if (swap_usage_add(si, nr_entries)) {
+ if (vm_swap_full())
+ schedule_work(&si->reclaim_work);
+ }
}
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -968,18 +1133,11 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
* Use atomic clear_bit operations only on zeromap instead of non-atomic
* bitmap_clear to prevent adjacent bits corruption due to simultaneous writes.
*/
- for (i = 0; i < nr_entries; i++)
+ for (i = 0; i < nr_entries; i++) {
clear_bit(offset + i, si->zeromap);
-
- if (offset < si->lowest_bit)
- si->lowest_bit = offset;
- if (end > si->highest_bit) {
- bool was_full = !si->highest_bit;
-
- WRITE_ONCE(si->highest_bit, end);
- if (was_full && (si->flags & SWP_WRITEOK))
- add_to_avail_list(si);
+ zswap_invalidate(swp_entry(si->type, offset + i));
}
+
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -999,50 +1157,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
*/
smp_wmb();
atomic_long_add(nr_entries, &nr_swap_pages);
- WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
-}
-
-static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
-{
- unsigned long prev;
-
- if (!(si->flags & SWP_SOLIDSTATE)) {
- si->cluster_next = next;
- return;
- }
-
- prev = this_cpu_read(*si->cluster_next_cpu);
- /*
- * Cross the swap address space size aligned trunk, choose
- * another trunk randomly to avoid lock contention on swap
- * address space if possible.
- */
- if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
- (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
- /* No free swap slots available */
- if (si->highest_bit <= si->lowest_bit)
- return;
- next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit);
- next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
- next = max_t(unsigned int, next, si->lowest_bit);
- }
- this_cpu_write(*si->cluster_next_cpu, next);
-}
-
-static bool swap_offset_available_and_locked(struct swap_info_struct *si,
- unsigned long offset)
-{
- if (data_race(!si->swap_map[offset])) {
- spin_lock(&si->lock);
- return true;
- }
-
- if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
- spin_lock(&si->lock);
- return true;
- }
-
- return false;
+ swap_usage_sub(si, nr_entries);
}
static int cluster_alloc_swap(struct swap_info_struct *si,
@@ -1051,10 +1166,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
{
int n_ret = 0;
- VM_BUG_ON(!si->cluster_info);
-
- si->flags += SWP_SCANNING;
-
while (n_ret < nr) {
unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
@@ -1063,8 +1174,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
slots[n_ret++] = swp_entry(si->type, offset);
}
- si->flags -= SWP_SCANNING;
-
return n_ret;
}
@@ -1072,13 +1181,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned char usage, int nr,
swp_entry_t slots[], int order)
{
- unsigned long offset;
- unsigned long scan_base;
- unsigned long last_in_cluster = 0;
- int latency_ration = LATENCY_LIMIT;
unsigned int nr_pages = 1 << order;
- int n_ret = 0;
- bool scanned_many = false;
/*
* We try to cluster swap pages by allocating them sequentially
@@ -1090,7 +1193,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
* But we do now try to find an empty cluster. -Andrea
* And we let swap pages go all over an SSD partition. Hugh
*/
-
if (order > 0) {
/*
* Should not even be attempting large allocations when huge
@@ -1103,165 +1205,30 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
}
/*
- * Swapfile is not block device or not using clusters so unable
+	 * Swapfile is not a block device, so we are unable
* to allocate large entries.
*/
- if (!(si->flags & SWP_BLKDEV) || !si->cluster_info)
+ if (!(si->flags & SWP_BLKDEV))
return 0;
}
- if (si->cluster_info)
- return cluster_alloc_swap(si, usage, nr, slots, order);
-
- si->flags += SWP_SCANNING;
-
- /* For HDD, sequential access is more important. */
- scan_base = si->cluster_next;
- offset = scan_base;
-
- if (unlikely(!si->cluster_nr--)) {
- if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
- si->cluster_nr = SWAPFILE_CLUSTER - 1;
- goto checks;
- }
-
- spin_unlock(&si->lock);
-
- /*
- * If seek is expensive, start searching for new cluster from
- * start of partition, to minimize the span of allocated swap.
- */
- scan_base = offset = si->lowest_bit;
- last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
-
- /* Locate the first empty (unaligned) cluster */
- for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) {
- if (si->swap_map[offset])
- last_in_cluster = offset + SWAPFILE_CLUSTER;
- else if (offset == last_in_cluster) {
- spin_lock(&si->lock);
- offset -= SWAPFILE_CLUSTER - 1;
- si->cluster_next = offset;
- si->cluster_nr = SWAPFILE_CLUSTER - 1;
- goto checks;
- }
- if (unlikely(--latency_ration < 0)) {
- cond_resched();
- latency_ration = LATENCY_LIMIT;
- }
- }
-
- offset = scan_base;
- spin_lock(&si->lock);
- si->cluster_nr = SWAPFILE_CLUSTER - 1;
- }
-
-checks:
- if (!(si->flags & SWP_WRITEOK))
- goto no_page;
- if (!si->highest_bit)
- goto no_page;
- if (offset > si->highest_bit)
- scan_base = offset = si->lowest_bit;
-
- /* reuse swap entry of cache-only swap if not busy. */
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
- int swap_was_freed;
- spin_unlock(&si->lock);
- swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT);
- spin_lock(&si->lock);
- /* entry was freed successfully, try to use this again */
- if (swap_was_freed > 0)
- goto checks;
- goto scan; /* check next one */
- }
-
- if (si->swap_map[offset]) {
- if (!n_ret)
- goto scan;
- else
- goto done;
- }
- memset(si->swap_map + offset, usage, nr_pages);
-
- swap_range_alloc(si, offset, nr_pages);
- slots[n_ret++] = swp_entry(si->type, offset);
-
- /* got enough slots or reach max slots? */
- if ((n_ret == nr) || (offset >= si->highest_bit))
- goto done;
-
- /* search for next available slot */
-
- /* time to take a break? */
- if (unlikely(--latency_ration < 0)) {
- if (n_ret)
- goto done;
- spin_unlock(&si->lock);
- cond_resched();
- spin_lock(&si->lock);
- latency_ration = LATENCY_LIMIT;
- }
-
- if (si->cluster_nr && !si->swap_map[++offset]) {
- /* non-ssd case, still more slots in cluster? */
- --si->cluster_nr;
- goto checks;
- }
+ return cluster_alloc_swap(si, usage, nr, slots, order);
+}
+static bool get_swap_device_info(struct swap_info_struct *si)
+{
+ if (!percpu_ref_tryget_live(&si->users))
+ return false;
/*
- * Even if there's no free clusters available (fragmented),
- * try to scan a little more quickly with lock held unless we
- * have scanned too many slots already.
+ * Guarantee the si->users are checked before accessing other
+ * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is
+	 * up to date.
+ *
+ * Paired with the spin_unlock() after setup_swap_info() in
+ * enable_swap_info(), and smp_wmb() in swapoff.
*/
- if (!scanned_many) {
- unsigned long scan_limit;
-
- if (offset < scan_base)
- scan_limit = scan_base;
- else
- scan_limit = si->highest_bit;
- for (; offset <= scan_limit && --latency_ration > 0;
- offset++) {
- if (!si->swap_map[offset])
- goto checks;
- }
- }
-
-done:
- if (order == 0)
- set_cluster_next(si, offset + 1);
- si->flags -= SWP_SCANNING;
- return n_ret;
-
-scan:
- VM_WARN_ON(order > 0);
- spin_unlock(&si->lock);
- while (++offset <= READ_ONCE(si->highest_bit)) {
- if (unlikely(--latency_ration < 0)) {
- cond_resched();
- latency_ration = LATENCY_LIMIT;
- scanned_many = true;
- }
- if (swap_offset_available_and_locked(si, offset))
- goto checks;
- }
- offset = si->lowest_bit;
- while (offset < scan_base) {
- if (unlikely(--latency_ration < 0)) {
- cond_resched();
- latency_ration = LATENCY_LIMIT;
- scanned_many = true;
- }
- if (swap_offset_available_and_locked(si, offset))
- goto checks;
- offset++;
- }
- spin_lock(&si->lock);
-
-no_page:
- si->flags -= SWP_SCANNING;
- return n_ret;
+ smp_rmb();
+ return true;
}
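/*
 * Minimal usage sketch mirroring the callers below (get_swap_pages() and
 * get_swap_page_of_type()); the wrapper name is hypothetical. The tryget
 * pins the device against swapoff, and put_swap_device() drops the
 * reference once the map has been scanned.
 */
static int alloc_one_slot_sketch(struct swap_info_struct *si, swp_entry_t *slot)
{
	int n = 0;

	if (get_swap_device_info(si)) {
		n = scan_swap_map_slots(si, SWAP_HAS_CACHE, 1, slot, 0);
		put_swap_device(si);
	}
	return n;	/* 1 if a slot was allocated, 0 otherwise */
}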
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
@@ -1291,32 +1258,15 @@ start_over:
/* requeue si to after same-priority siblings */
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
- spin_lock(&si->lock);
- if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- if (plist_node_empty(&si->avail_lists[node])) {
- spin_unlock(&si->lock);
- goto nextsi;
- }
- WARN(!si->highest_bit,
- "swap_info %d in list but !highest_bit\n",
- si->type);
- WARN(!(si->flags & SWP_WRITEOK),
- "swap_info %d in list but !SWP_WRITEOK\n",
- si->type);
- __del_from_avail_list(si);
- spin_unlock(&si->lock);
- goto nextsi;
+ if (get_swap_device_info(si)) {
+ n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+ n_goal, swp_entries, order);
+ put_swap_device(si);
+ if (n_ret || size > 1)
+ goto check_out;
}
- n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
- n_goal, swp_entries, order);
- spin_unlock(&si->lock);
- if (n_ret || size > 1)
- goto check_out;
- cond_resched();
spin_lock(&swap_avail_lock);
-nextsi:
/*
* if we got here, it's likely that si was almost full before,
* and since scan_swap_map_slots() can drop the si->lock,
@@ -1376,22 +1326,6 @@ out:
return NULL;
}
-static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
- struct swap_info_struct *q)
-{
- struct swap_info_struct *p;
-
- p = _swap_info_get(entry);
-
- if (p != q) {
- if (q != NULL)
- spin_unlock(&q->lock);
- if (p != NULL)
- spin_lock(&p->lock);
- }
- return p;
-}
-
static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
unsigned long offset,
unsigned char usage)
@@ -1481,16 +1415,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
si = swp_swap_info(entry);
if (!si)
goto bad_nofile;
- if (!percpu_ref_tryget_live(&si->users))
+ if (!get_swap_device_info(si))
goto out;
- /*
- * Guarantee the si->users are checked before accessing other
- * fields of swap_info_struct.
- *
- * Paired with the spin_unlock() after setup_swap_info() in
- * enable_swap_info().
- */
- smp_rmb();
offset = swp_offset(entry);
if (offset >= si->max)
goto put_out;
@@ -1513,11 +1439,11 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si,
unsigned long offset = swp_offset(entry);
unsigned char usage;
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
usage = __swap_entry_free_locked(si, offset, 1);
- unlock_cluster_or_swap_info(si, ci);
if (!usage)
- free_swap_slot(entry);
+ swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1);
+ unlock_cluster(ci);
return usage;
}
@@ -1538,22 +1464,17 @@ static bool __swap_entries_free(struct swap_info_struct *si,
if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
goto fallback;
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
if (!swap_is_last_map(si, offset, nr, &has_cache)) {
- unlock_cluster_or_swap_info(si, ci);
+ unlock_cluster(ci);
goto fallback;
}
for (i = 0; i < nr; i++)
WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
- unlock_cluster_or_swap_info(si, ci);
+ if (!has_cache)
+ swap_entry_range_free(si, ci, entry, nr);
+ unlock_cluster(ci);
- if (!has_cache) {
- for (i = 0; i < nr; i++)
- zswap_invalidate(swp_entry(si->type, offset + i));
- spin_lock(&si->lock);
- swap_entry_range_free(si, entry, nr);
- spin_unlock(&si->lock);
- }
return has_cache;
fallback:
@@ -1573,24 +1494,32 @@ fallback:
* Drop the last HAS_CACHE flag of swap entries, caller have to
* ensure all entries belong to the same cgroup.
*/
-static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
- unsigned int nr_pages)
+static void swap_entry_range_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages)
{
unsigned long offset = swp_offset(entry);
unsigned char *map = si->swap_map + offset;
unsigned char *map_end = map + nr_pages;
- struct swap_cluster_info *ci;
- ci = lock_cluster(si, offset);
+ /* It should never free entries across different clusters */
+ VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1));
+ VM_BUG_ON(cluster_is_empty(ci));
+ VM_BUG_ON(ci->count < nr_pages);
+
+ ci->count -= nr_pages;
do {
VM_BUG_ON(*map != SWAP_HAS_CACHE);
*map = 0;
} while (++map < map_end);
- dec_cluster_info_page(si, ci, nr_pages);
- unlock_cluster(ci);
mem_cgroup_uncharge_swap(entry, nr_pages);
swap_range_free(si, offset, nr_pages);
+
+ if (!ci->count)
+ free_cluster(si, ci);
+ else
+ partial_free_cluster(si, ci);
}
static void cluster_swap_free_nr(struct swap_info_struct *si,
@@ -1598,29 +1527,14 @@ static void cluster_swap_free_nr(struct swap_info_struct *si,
unsigned char usage)
{
struct swap_cluster_info *ci;
- DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 };
- int i, nr;
+ unsigned long end = offset + nr_pages;
- ci = lock_cluster_or_swap_info(si, offset);
- while (nr_pages) {
- nr = min(BITS_PER_LONG, nr_pages);
- for (i = 0; i < nr; i++) {
- if (!__swap_entry_free_locked(si, offset + i, usage))
- bitmap_set(to_free, i, 1);
- }
- if (!bitmap_empty(to_free, BITS_PER_LONG)) {
- unlock_cluster_or_swap_info(si, ci);
- for_each_set_bit(i, to_free, BITS_PER_LONG)
- free_swap_slot(swp_entry(si->type, offset + i));
- if (nr == nr_pages)
- return;
- bitmap_clear(to_free, 0, BITS_PER_LONG);
- ci = lock_cluster_or_swap_info(si, offset);
- }
- offset += nr;
- nr_pages -= nr;
- }
- unlock_cluster_or_swap_info(si, ci);
+ ci = lock_cluster(si, offset);
+ do {
+ if (!__swap_entry_free_locked(si, offset, usage))
+ swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1);
+ } while (++offset < end);
+ unlock_cluster(ci);
}
/*
@@ -1659,59 +1573,35 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
if (!si)
return;
- ci = lock_cluster_or_swap_info(si, offset);
- if (size > 1 && swap_is_has_cache(si, offset, size)) {
- unlock_cluster_or_swap_info(si, ci);
- spin_lock(&si->lock);
- swap_entry_range_free(si, entry, size);
- spin_unlock(&si->lock);
- return;
- }
- for (int i = 0; i < size; i++, entry.val++) {
- if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
- unlock_cluster_or_swap_info(si, ci);
- free_swap_slot(entry);
- if (i == size - 1)
- return;
- lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
+ if (swap_is_has_cache(si, offset, size))
+ swap_entry_range_free(si, ci, entry, size);
+ else {
+ for (int i = 0; i < size; i++, entry.val++) {
+ if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE))
+ swap_entry_range_free(si, ci, entry, 1);
}
}
- unlock_cluster_or_swap_info(si, ci);
-}
-
-static int swp_entry_cmp(const void *ent1, const void *ent2)
-{
- const swp_entry_t *e1 = ent1, *e2 = ent2;
-
- return (int)swp_type(*e1) - (int)swp_type(*e2);
+ unlock_cluster(ci);
}
void swapcache_free_entries(swp_entry_t *entries, int n)
{
- struct swap_info_struct *p, *prev;
int i;
+ struct swap_cluster_info *ci;
+ struct swap_info_struct *si = NULL;
if (n <= 0)
return;
- prev = NULL;
- p = NULL;
-
- /*
- * Sort swap entries by swap device, so each lock is only taken once.
- * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
- * so low that it isn't necessary to optimize further.
- */
- if (nr_swapfiles > 1)
- sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
for (i = 0; i < n; ++i) {
- p = swap_info_get_cont(entries[i], prev);
- if (p)
- swap_entry_range_free(p, entries[i], 1);
- prev = p;
+ si = _swap_info_get(entries[i]);
+ if (si) {
+ ci = lock_cluster(si, swp_offset(entries[i]));
+ swap_entry_range_free(si, ci, entries[i], 1);
+ unlock_cluster(ci);
+ }
}
- if (p)
- spin_unlock(&p->lock);
}
int __swap_count(swp_entry_t entry)
@@ -1733,9 +1623,9 @@ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
struct swap_cluster_info *ci;
int count;
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
count = swap_count(si->swap_map[offset]);
- unlock_cluster_or_swap_info(si, ci);
+ unlock_cluster(ci);
return count;
}
@@ -1758,7 +1648,7 @@ int swp_swapcount(swp_entry_t entry)
offset = swp_offset(entry);
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
count = swap_count(si->swap_map[offset]);
if (!(count & COUNT_CONTINUED))
@@ -1781,7 +1671,7 @@ int swp_swapcount(swp_entry_t entry)
n *= (SWAP_CONT_MAX + 1);
} while (tmp_count & COUNT_CONTINUED);
out:
- unlock_cluster_or_swap_info(si, ci);
+ unlock_cluster(ci);
return count;
}
@@ -1796,8 +1686,8 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
int i;
bool ret = false;
- ci = lock_cluster_or_swap_info(si, offset);
- if (!ci || nr_pages == 1) {
+ ci = lock_cluster(si, offset);
+ if (nr_pages == 1) {
if (swap_count(map[roffset]))
ret = true;
goto unlock_out;
@@ -1809,7 +1699,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
}
}
unlock_out:
- unlock_cluster_or_swap_info(si, ci);
+ unlock_cluster(ci);
return ret;
}
@@ -1963,10 +1853,11 @@ swp_entry_t get_swap_page_of_type(int type)
goto fail;
/* This is called for allocating swap entry, not cache */
- spin_lock(&si->lock);
- if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
- atomic_long_dec(&nr_swap_pages);
- spin_unlock(&si->lock);
+ if (get_swap_device_info(si)) {
+ if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
+ atomic_long_dec(&nr_swap_pages);
+ put_swap_device(si);
+ }
fail:
return entry;
}
@@ -2057,7 +1948,7 @@ unsigned int count_swap_pages(int type, int free)
if (sis->flags & SWP_WRITEOK) {
n = sis->pages;
if (free)
- n -= sis->inuse_pages;
+ n -= swap_usage_in_pages(sis);
}
spin_unlock(&sis->lock);
}
@@ -2392,7 +2283,7 @@ static int try_to_unuse(unsigned int type)
swp_entry_t entry;
unsigned int i;
- if (!READ_ONCE(si->inuse_pages))
+ if (!swap_usage_in_pages(si))
goto success;
retry:
@@ -2405,7 +2296,7 @@ retry:
spin_lock(&mmlist_lock);
p = &init_mm.mmlist;
- while (READ_ONCE(si->inuse_pages) &&
+ while (swap_usage_in_pages(si) &&
!signal_pending(current) &&
(p = p->next) != &init_mm.mmlist) {
@@ -2433,7 +2324,7 @@ retry:
mmput(prev_mm);
i = 0;
- while (READ_ONCE(si->inuse_pages) &&
+ while (swap_usage_in_pages(si) &&
!signal_pending(current) &&
(i = find_next_to_unuse(si, i)) != 0) {
@@ -2468,7 +2359,7 @@ retry:
* folio_alloc_swap(), temporarily hiding that swap. It's easy
* and robust (though cpu-intensive) just to keep retrying.
*/
- if (READ_ONCE(si->inuse_pages)) {
+ if (swap_usage_in_pages(si)) {
if (!signal_pending(current))
goto retry;
return -EINTR;
@@ -2495,7 +2386,7 @@ static void drain_mmlist(void)
unsigned int type;
for (type = 0; type < nr_swapfiles; type++)
- if (swap_info[type]->inuse_pages)
+ if (swap_usage_in_pages(swap_info[type]))
return;
spin_lock(&mmlist_lock);
list_for_each_safe(p, next, &init_mm.mmlist)
@@ -2674,7 +2565,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
static void _enable_swap_info(struct swap_info_struct *si)
{
- si->flags |= SWP_WRITEOK;
atomic_long_add(si->pages, &nr_swap_pages);
total_swap_pages += si->pages;
@@ -2691,9 +2581,8 @@ static void _enable_swap_info(struct swap_info_struct *si)
*/
plist_add(&si->list, &swap_active_head);
- /* add to available list iff swap device is not full */
- if (si->highest_bit)
- add_to_avail_list(si);
+ /* Add back to available list */
+ add_to_avail_list(si, true);
}
static void enable_swap_info(struct swap_info_struct *si, int prio,
@@ -2742,6 +2631,24 @@ bool has_usable_swap(void)
return ret;
}
+/*
+ * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range
+ * sees the updated flags, so there will be no more allocations.
+ */
+static void wait_for_allocation(struct swap_info_struct *si)
+{
+ unsigned long offset;
+ unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
+ struct swap_cluster_info *ci;
+
+ BUG_ON(si->flags & SWP_WRITEOK);
+
+ for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
+ ci = lock_cluster(si, offset);
+ unlock_cluster(ci);
+ }
+}
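/*
 * Illustrative interleaving of the handshake wait_for_allocation() relies
 * on (both halves are visible in this patch): cluster_alloc_range() checks
 * SWP_WRITEOK under ci->lock, so once swapoff has cleared the flag, cycling
 * every cluster lock guarantees no allocator is still working on this
 * device.
 *
 *	allocator (cluster_alloc_range)		swapoff
 *	-------------------------------		-----------------------------
 *	lock ci->lock				si->flags &= ~SWP_WRITEOK
 *	if (!(si->flags & SWP_WRITEOK))		wait_for_allocation():
 *		return false;				lock ci->lock	(waits)
 *	... allocate slots ...				unlock ci->lock
 *	unlock ci->lock
 */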
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -2791,7 +2698,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
goto out_dput;
}
spin_lock(&p->lock);
- del_from_avail_list(p);
+ del_from_avail_list(p, true);
if (p->prio < 0) {
struct swap_info_struct *si = p;
int nid;
@@ -2809,10 +2716,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
plist_del(&p->list, &swap_active_head);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
- p->flags &= ~SWP_WRITEOK;
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
+ wait_for_allocation(p);
+
disable_swap_slots_cache_lock();
set_current_oom_origin();
@@ -2855,16 +2763,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_lock(&p->lock);
drain_mmlist();
- /* wait for anyone still in scan_swap_map_slots */
- p->highest_bit = 0; /* cuts scans short */
- while (p->flags >= SWP_SCANNING) {
- spin_unlock(&p->lock);
- spin_unlock(&swap_lock);
- schedule_timeout_uninterruptible(1);
- spin_lock(&swap_lock);
- spin_lock(&p->lock);
- }
-
swap_file = p->swap_file;
p->swap_file = NULL;
p->max = 0;
@@ -2881,8 +2779,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
- free_percpu(p->cluster_next_cpu);
- p->cluster_next_cpu = NULL;
+ kfree(p->global_cluster);
+ p->global_cluster = NULL;
vfree(swap_map);
kvfree(zeromap);
kvfree(cluster_info);
@@ -2992,7 +2890,7 @@ static int swap_show(struct seq_file *swap, void *v)
}
bytes = K(si->pages);
- inuse = K(READ_ONCE(si->inuse_pages));
+ inuse = K(swap_usage_in_pages(si));
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
@@ -3109,6 +3007,7 @@ static struct swap_info_struct *alloc_swap_info(void)
}
spin_lock_init(&p->lock);
spin_lock_init(&p->cont_lock);
+ atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
init_completion(&p->comp);
return p;
@@ -3193,10 +3092,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
return 0;
}
- si->lowest_bit = 1;
- si->cluster_next = 1;
- si->cluster_nr = 0;
-
maxpages = swapfile_maximum_size;
last_page = swap_header->info.last_page;
if (!last_page) {
@@ -3213,7 +3108,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
if ((unsigned int)maxpages == 0)
maxpages = UINT_MAX;
}
- si->highest_bit = maxpages - 1;
if (!maxpages)
return 0;
@@ -3281,7 +3175,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
unsigned long maxpages)
{
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
- unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
struct swap_cluster_info *cluster_info;
unsigned long i, j, k, idx;
int cpu, err = -ENOMEM;
@@ -3293,25 +3186,27 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- si->cluster_next_cpu = alloc_percpu(unsigned int);
- if (!si->cluster_next_cpu)
- goto err_free;
-
- /* Random start position to help with wear leveling */
- for_each_possible_cpu(cpu)
- per_cpu(*si->cluster_next_cpu, cpu) =
- get_random_u32_inclusive(1, si->highest_bit);
+ if (si->flags & SWP_SOLIDSTATE) {
+ si->percpu_cluster = alloc_percpu(struct percpu_cluster);
+ if (!si->percpu_cluster)
+ goto err_free;
- si->percpu_cluster = alloc_percpu(struct percpu_cluster);
- if (!si->percpu_cluster)
- goto err_free;
+ for_each_possible_cpu(cpu) {
+ struct percpu_cluster *cluster;
- for_each_possible_cpu(cpu) {
- struct percpu_cluster *cluster;
-
- cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+ cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cluster->next[i] = SWAP_ENTRY_INVALID;
+ local_lock_init(&cluster->lock);
+ }
+ } else {
+ si->global_cluster = kmalloc(sizeof(*si->global_cluster),
+ GFP_KERNEL);
+ if (!si->global_cluster)
+ goto err_free;
for (i = 0; i < SWAP_NR_ORDERS; i++)
- cluster->next[i] = SWAP_NEXT_INVALID;
+ si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
+ spin_lock_init(&si->global_cluster_lock);
}
/*
@@ -3335,7 +3230,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < SWAP_NR_ORDERS; i++) {
INIT_LIST_HEAD(&si->nonfull_clusters[i]);
INIT_LIST_HEAD(&si->frag_clusters[i]);
- si->frag_cluster_nr[i] = 0;
+ atomic_long_set(&si->frag_cluster_nr[i], 0);
}
/*
@@ -3343,7 +3238,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
* sharing same address space.
*/
for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
- j = (k + col) % SWAP_CLUSTER_COLS;
+ j = k % SWAP_CLUSTER_COLS;
for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
struct swap_cluster_info *ci;
idx = i * SWAP_CLUSTER_COLS + j;
@@ -3493,18 +3388,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (si->bdev && bdev_nonrot(si->bdev)) {
si->flags |= SWP_SOLIDSTATE;
-
- cluster_info = setup_clusters(si, swap_header, maxpages);
- if (IS_ERR(cluster_info)) {
- error = PTR_ERR(cluster_info);
- cluster_info = NULL;
- goto bad_swap_unlock_inode;
- }
} else {
atomic_inc(&nr_rotate_swap);
inced_nr_rotate_swap = true;
}
+ cluster_info = setup_clusters(si, swap_header, maxpages);
+ if (IS_ERR(cluster_info)) {
+ error = PTR_ERR(cluster_info);
+ cluster_info = NULL;
+ goto bad_swap_unlock_inode;
+ }
+
if ((swap_flags & SWAP_FLAG_DISCARD) &&
si->bdev && bdev_max_discard_sectors(si->bdev)) {
/*
@@ -3585,8 +3480,8 @@ bad_swap_unlock_inode:
bad_swap:
free_percpu(si->percpu_cluster);
si->percpu_cluster = NULL;
- free_percpu(si->cluster_next_cpu);
- si->cluster_next_cpu = NULL;
+ kfree(si->global_cluster);
+ si->global_cluster = NULL;
inode = NULL;
destroy_swap_extents(si);
swap_cgroup_swapoff(si->type);
@@ -3623,7 +3518,7 @@ void si_swapinfo(struct sysinfo *val)
struct swap_info_struct *si = swap_info[type];
if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
- nr_to_be_unused += READ_ONCE(si->inuse_pages);
+ nr_to_be_unused += swap_usage_in_pages(si);
}
val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
@@ -3651,11 +3546,15 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
int err, i;
si = swp_swap_info(entry);
+ if (WARN_ON_ONCE(!si)) {
+ pr_err("%s%08lx\n", Bad_file, entry.val);
+ return -EINVAL;
+ }
offset = swp_offset(entry);
VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
VM_WARN_ON(usage == 1 && nr > 1);
- ci = lock_cluster_or_swap_info(si, offset);
+ ci = lock_cluster(si, offset);
err = 0;
for (i = 0; i < nr; i++) {
@@ -3710,7 +3609,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
}
unlock_out:
- unlock_cluster_or_swap_info(si, ci);
+ unlock_cluster(ci);
return err;
}
@@ -3819,7 +3718,6 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
*/
goto outer;
}
- spin_lock(&si->lock);
offset = swp_offset(entry);
@@ -3884,7 +3782,6 @@ out_unlock_cont:
spin_unlock(&si->cont_lock);
out:
unlock_cluster(ci);
- spin_unlock(&si->lock);
put_swap_device(si);
outer:
if (page)
diff --git a/mm/truncate.c b/mm/truncate.c
index 7c304d2f0052..76d8fcd89bd0 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -525,6 +525,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
}
EXPORT_SYMBOL(invalidate_mapping_pages);
+static int folio_launder(struct address_space *mapping, struct folio *folio)
+{
+ if (!folio_test_dirty(folio))
+ return 0;
+ if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL)
+ return 0;
+ return mapping->a_ops->launder_folio(folio);
+}
+
/*
* This is like mapping_evict_folio(), except it ignores the folio's
* refcount. We do this because invalidate_inode_pages2() needs stronger
@@ -532,14 +541,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
* shrink_folio_list() has a temp ref on them, or because they're transiently
* sitting in the folio_add_lru() caches.
*/
-static int invalidate_complete_folio2(struct address_space *mapping,
- struct folio *folio)
+int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
+ gfp_t gfp)
{
- if (folio->mapping != mapping)
- return 0;
+ int ret;
- if (!filemap_release_folio(folio, GFP_KERNEL))
- return 0;
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (folio_mapped(folio))
+ unmap_mapping_folio(folio);
+ BUG_ON(folio_mapped(folio));
+
+ ret = folio_launder(mapping, folio);
+ if (ret)
+ return ret;
+ if (folio->mapping != mapping)
+ return -EBUSY;
+ if (!filemap_release_folio(folio, gfp))
+ return -EBUSY;
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
@@ -558,16 +577,7 @@ static int invalidate_complete_folio2(struct address_space *mapping,
failed:
xa_unlock_irq(&mapping->i_pages);
spin_unlock(&mapping->host->i_lock);
- return 0;
-}
-
-static int folio_launder(struct address_space *mapping, struct folio *folio)
-{
- if (!folio_test_dirty(folio))
- return 0;
- if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL)
- return 0;
- return mapping->a_ops->launder_folio(folio);
+ return -EBUSY;
}
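/*
 * Minimal caller sketch (the helper name is hypothetical; it mirrors the
 * use in invalidate_inode_pages2_range() below): the folio must be locked
 * and writeback must have finished before calling folio_unmap_invalidate().
 */
static int evict_one_folio(struct address_space *mapping, struct folio *folio)
{
	int ret;

	folio_lock(folio);
	folio_wait_writeback(folio);
	ret = folio_unmap_invalidate(mapping, folio, GFP_KERNEL);
	folio_unlock(folio);
	return ret;	/* negative errno (e.g. -EBUSY) if still in use */
}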
/**
@@ -631,16 +641,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
}
VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
folio_wait_writeback(folio);
-
- if (folio_mapped(folio))
- unmap_mapping_folio(folio);
- BUG_ON(folio_mapped(folio));
-
- ret2 = folio_launder(mapping, folio);
- if (ret2 == 0) {
- if (!invalidate_complete_folio2(mapping, folio))
- ret2 = -EBUSY;
- }
+ ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL);
if (ret2 < 0)
ret = ret2;
folio_unlock(folio);
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 83c164aba6e0..dbdcc43964fb 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -17,7 +17,7 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
-#include <linux/thread_info.h>
+#include <linux/ucopysize.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
@@ -201,7 +201,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
}
}
-static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
+DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
+ validate_usercopy_range);
+EXPORT_SYMBOL(validate_usercopy_range);
/*
* Validates that the given object is:
@@ -212,9 +214,6 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
- if (static_branch_unlikely(&bypass_usercopy_checks))
- return;
-
/* Skip all tests if size is zero. */
if (!n)
return;
@@ -255,7 +254,8 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
}
EXPORT_SYMBOL(__check_object_size);
-static bool enable_checks __initdata = true;
+static bool enable_checks __initdata =
+ IS_ENABLED(CONFIG_HARDENED_USERCOPY_DEFAULT_ON);
static int __init parse_hardened_usercopy(char *str)
{
@@ -269,8 +269,10 @@ __setup("hardened_usercopy=", parse_hardened_usercopy);
static int __init set_hardened_usercopy(void)
{
- if (enable_checks == false)
- static_branch_enable(&bypass_usercopy_checks);
+ if (enable_checks)
+ static_branch_enable(&validate_usercopy_range);
+ else
+ static_branch_disable(&validate_usercopy_range);
return 1;
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 60a0be33766f..d06453fa8aba 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -18,6 +18,7 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
+#include "swap.h"
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
@@ -1020,6 +1021,14 @@ void double_pt_unlock(spinlock_t *ptl1,
__release(ptl2);
}
+static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval)
+{
+ return pte_same(ptep_get(src_pte), orig_src_pte) &&
+ pte_same(ptep_get(dst_pte), orig_dst_pte) &&
+ pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
+}
static int move_present_pte(struct mm_struct *mm,
struct vm_area_struct *dst_vma,
@@ -1027,6 +1036,7 @@ static int move_present_pte(struct mm_struct *mm,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl,
struct folio *src_folio)
{
@@ -1034,8 +1044,8 @@ static int move_present_pte(struct mm_struct *mm,
double_pt_lock(dst_ptl, src_ptl);
- if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
- !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval)) {
err = -EAGAIN;
goto out;
}
@@ -1067,23 +1077,32 @@ out:
return err;
}
-static int move_swap_pte(struct mm_struct *mm,
+static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
- spinlock_t *dst_ptl, spinlock_t *src_ptl)
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio *src_folio)
{
- if (!pte_swp_exclusive(orig_src_pte))
- return -EBUSY;
-
double_pt_lock(dst_ptl, src_ptl);
- if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
- !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval)) {
double_pt_unlock(dst_ptl, src_ptl);
return -EAGAIN;
}
+ /*
+ * The src_folio resides in the swapcache, requiring an update to its
+ * index and mapping to align with the dst_vma, where a swap-in may
+ * occur and hit the swapcache after moving the PTE.
+ */
+ if (src_folio) {
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
+ }
+
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
@@ -1097,13 +1116,14 @@ static int move_zeropage_pte(struct mm_struct *mm,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl)
{
pte_t zero_pte;
double_pt_lock(dst_ptl, src_ptl);
- if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
- !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval)) {
double_pt_unlock(dst_ptl, src_ptl);
return -EAGAIN;
}
@@ -1130,12 +1150,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
__u64 mode)
{
swp_entry_t entry;
+ struct swap_info_struct *si = NULL;
pte_t orig_src_pte, orig_dst_pte;
pte_t src_folio_pte;
spinlock_t *src_ptl, *dst_ptl;
pte_t *src_pte = NULL;
pte_t *dst_pte = NULL;
pmd_t dummy_pmdval;
+ pmd_t dst_pmdval;
struct folio *src_folio = NULL;
struct anon_vma *src_anon_vma = NULL;
struct mmu_notifier_range range;
@@ -1148,11 +1170,11 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
retry:
/*
* Use the maywrite version to indicate that dst_pte will be modified,
- * but since we will use pte_same() to detect the change of the pte
- * entry, there is no need to get pmdval, so just pass a dummy variable
- * to it.
+ * since dst_pte needs to be none, the subsequent pte_same() check
+ * cannot prevent the dst_pte page from being freed concurrently, so we
+	 * also need to obtain dst_pmdval and recheck pmd_same() later.
*/
- dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dummy_pmdval,
+ dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
&dst_ptl);
/* Retry if a huge pmd materialized from under us */
@@ -1161,7 +1183,11 @@ retry:
goto out;
}
- /* same as dst_pte */
+ /*
+ * Unlike dst_pte, the subsequent pte_same() check can ensure the
+ * stability of the src_pte page, so there is no need to get pmdval,
+ * just pass a dummy variable to it.
+ */
src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
&src_ptl);
@@ -1177,8 +1203,8 @@ retry:
}
/* Sanity checks before the operation */
- if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) ||
- WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) {
+ if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
+ pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
err = -EINVAL;
goto out;
}
@@ -1213,7 +1239,7 @@ retry:
err = move_zeropage_pte(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte,
- dst_ptl, src_ptl);
+ dst_pmd, dst_pmdval, dst_ptl, src_ptl);
goto out;
}
@@ -1224,6 +1250,7 @@ retry:
*/
if (!src_folio) {
struct folio *folio;
+ bool locked;
/*
* Pin the page while holding the lock to be sure the
@@ -1243,14 +1270,28 @@ retry:
goto out;
}
+ locked = folio_trylock(folio);
+ /*
+ * We avoid waiting for folio lock with a raised
+ * refcount for large folios because extra refcounts
+ * will result in split_folio() failing later and
+ * retrying. If multiple tasks are trying to move a
+ * large folio we can end up livelocking.
+ */
+ if (!locked && folio_test_large(folio)) {
+ spin_unlock(src_ptl);
+ err = -EAGAIN;
+ goto out;
+ }
+
folio_get(folio);
src_folio = folio;
src_folio_pte = orig_src_pte;
spin_unlock(src_ptl);
- if (!folio_trylock(src_folio)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ if (!locked) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
folio_lock(src_folio);
@@ -1266,8 +1307,8 @@ retry:
/* at this point we have src_folio locked */
if (folio_test_large(src_folio)) {
/* split_folio() can block */
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
err = split_folio(src_folio);
if (err)
@@ -1292,8 +1333,8 @@ retry:
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
anon_vma_lock_write(src_anon_vma);
@@ -1303,14 +1344,16 @@ retry:
err = move_present_pte(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
- orig_dst_pte, orig_src_pte,
- dst_ptl, src_ptl, src_folio);
+ orig_dst_pte, orig_src_pte, dst_pmd,
+ dst_pmdval, dst_ptl, src_ptl, src_folio);
} else {
+ struct folio *folio = NULL;
+
entry = pte_to_swp_entry(orig_src_pte);
if (non_swap_entry(entry)) {
if (is_migration_entry(entry)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
err = -EAGAIN;
@@ -1319,10 +1362,53 @@ retry:
goto out;
}
- err = move_swap_pte(mm, dst_addr, src_addr,
- dst_pte, src_pte,
- orig_dst_pte, orig_src_pte,
- dst_ptl, src_ptl);
+ if (!pte_swp_exclusive(orig_src_pte)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ si = get_swap_device(entry);
+ if (unlikely(!si)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ /*
+ * Verify the existence of the swapcache. If present, the folio's
+ * index and mapping must be updated even when the PTE is a swap
+ * entry. The anon_vma lock is not taken during this process since
+ * the folio has already been unmapped, and the swap entry is
+ * exclusive, preventing rmap walks.
+ *
+ * For large folios, return -EBUSY immediately, as split_folio()
+ * also returns -EBUSY when attempting to split unmapped large
+ * folios in the swapcache. This issue needs to be resolved
+ * separately to allow proper handling.
+ */
+ if (!src_folio)
+ folio = filemap_get_folio(swap_address_space(entry),
+ swap_cache_index(entry));
+ if (!IS_ERR_OR_NULL(folio)) {
+ if (folio_test_large(folio)) {
+ err = -EBUSY;
+ folio_put(folio);
+ goto out;
+ }
+ src_folio = folio;
+ src_folio_pte = orig_src_pte;
+ if (!folio_trylock(src_folio)) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ put_swap_device(si);
+ si = NULL;
+ /* now we can block and wait */
+ folio_lock(src_folio);
+ goto retry;
+ }
+ }
+ err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
+ dst_ptl, src_ptl, src_folio);
}
out:
@@ -1339,6 +1425,8 @@ out:
if (src_pte)
pte_unmap(src_pte);
mmu_notifier_invalidate_range_end(&range);
+ if (si)
+ put_swap_device(si);
return err;
}
diff --git a/mm/util.c b/mm/util.c
index 60aa40f612b8..e7d81371032b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -23,6 +23,7 @@
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
+#include <linux/fsnotify.h>
#include <linux/uaccess.h>
@@ -569,6 +570,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
+ if (!ret)
+ ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -582,6 +585,23 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
return ret;
}
+/*
+ * Perform a userland memory mapping into the current process address space. See
+ * the comment for do_mmap() for more details on this operation in general.
+ *
+ * This differs from do_mmap() in that:
+ *
+ * a. An offset parameter is provided rather than pgoff, which is checked
+ *    for both overflow and page alignment.
+ * b. mmap locking is performed on the caller's behalf.
+ * c. Userfaultfd unmap events and memory population are handled.
+ *
+ * This means that this function performs essentially the same work as if
+ * userland were invoking mmap(2).
+ *
+ * Returns either an error, or the address at which the requested mapping has
+ * been performed.
+ */
unsigned long vm_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long offset)
@@ -595,168 +615,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
}
EXPORT_SYMBOL(vm_mmap);
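/*
 * Usage sketch (not part of this patch; the helper name is hypothetical):
 * map the first @len bytes of @file read-only into the current process,
 * just as a userspace mmap(2) call would.
 */
static unsigned long map_file_ro(struct file *file, unsigned long len)
{
	unsigned long addr;

	addr = vm_mmap(file, 0, len, PROT_READ, MAP_PRIVATE, 0);
	if (IS_ERR_VALUE(addr))
		return 0;	/* addr held a negative errno on failure */
	return addr;
}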
-static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
-{
- /*
- * We want to attempt a large physically contiguous block first because
- * it is less likely to fragment multiple larger blocks and therefore
- * contribute to a long term fragmentation less than vmalloc fallback.
- * However make sure that larger requests are not too disruptive - no
- * OOM killer and no allocation failure warnings as we have a fallback.
- */
- if (size > PAGE_SIZE) {
- flags |= __GFP_NOWARN;
-
- if (!(flags & __GFP_RETRY_MAYFAIL))
- flags |= __GFP_NORETRY;
-
- /* nofail semantic is implemented by the vmalloc fallback */
- flags &= ~__GFP_NOFAIL;
- }
-
- return flags;
-}
-
-/**
- * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
- * failure, fall back to non-contiguous (vmalloc) allocation.
- * @size: size of the request.
- * @b: which set of kmalloc buckets to allocate from.
- * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
- * @node: numa node to allocate from
- *
- * Uses kmalloc to get the memory but if the allocation fails then falls back
- * to the vmalloc allocator. Use kvfree for freeing the memory.
- *
- * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
- * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
- * preferable to the vmalloc fallback, due to visible performance drawbacks.
- *
- * Return: pointer to the allocated memory of %NULL in case of failure
- */
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
-{
- void *ret;
-
- /*
- * It doesn't really make sense to fallback to vmalloc for sub page
- * requests
- */
- ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b),
- kmalloc_gfp_adjust(flags, size),
- node);
- if (ret || size <= PAGE_SIZE)
- return ret;
-
- /* non-sleeping allocations are not supported by vmalloc */
- if (!gfpflags_allow_blocking(flags))
- return NULL;
-
- /* Don't even allow crazy sizes */
- if (unlikely(size > INT_MAX)) {
- WARN_ON_ONCE(!(flags & __GFP_NOWARN));
- return NULL;
- }
-
- /*
- * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
- * since the callers already cannot assume anything
- * about the resulting pointer, and cannot play
- * protection games.
- */
- return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- node, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(__kvmalloc_node_noprof);
-
-/**
- * kvfree() - Free memory.
- * @addr: Pointer to allocated memory.
- *
- * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
- * It is slightly more efficient to use kfree() or vfree() if you are certain
- * that you know which one to use.
- *
- * Context: Either preemptible task context or not-NMI interrupt.
- */
-void kvfree(const void *addr)
-{
- if (is_vmalloc_addr(addr))
- vfree(addr);
- else
- kfree(addr);
-}
-EXPORT_SYMBOL(kvfree);
-
-/**
- * kvfree_sensitive - Free a data object containing sensitive information.
- * @addr: address of the data object to be freed.
- * @len: length of the data object.
- *
- * Use the special memzero_explicit() function to clear the content of a
- * kvmalloc'ed object containing sensitive data to make sure that the
- * compiler won't optimize out the data clearing.
- */
-void kvfree_sensitive(const void *addr, size_t len)
-{
- if (likely(!ZERO_OR_NULL_PTR(addr))) {
- memzero_explicit((void *)addr, len);
- kvfree(addr);
- }
-}
-EXPORT_SYMBOL(kvfree_sensitive);
-
-/**
- * kvrealloc - reallocate memory; contents remain unchanged
- * @p: object to reallocate memory for
- * @size: the size to reallocate
- * @flags: the flags for the page level allocator
- *
- * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
- * and @p is not a %NULL pointer, the object pointed to is freed.
- *
- * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
- * initial memory allocation, every subsequent call to this API for the same
- * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
- * __GFP_ZERO is not fully honored by this API.
- *
- * In any case, the contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes.
- *
- * This function must not be called concurrently with itself or kvfree() for the
- * same memory allocation.
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
-{
- void *n;
-
- if (is_vmalloc_addr(p))
- return vrealloc_noprof(p, size, flags);
-
- n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
- if (!n) {
- /* We failed to krealloc(), fall back to kvmalloc(). */
- n = kvmalloc_noprof(size, flags);
- if (!n)
- return NULL;
-
- if (p) {
- /* We already know that `p` is not a vmalloc address. */
- kasan_disable_current();
- memcpy(n, kasan_reset_tag(p), ksize(p));
- kasan_enable_current();
-
- kfree(p);
- }
- }
-
- return n;
-}
-EXPORT_SYMBOL(kvrealloc_noprof);
-
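The kvmalloc()/kvfree()/kvrealloc() family documented above is removed from mm/util.c here; from a caller's point of view the API appears unchanged (the helpers look relocated rather than dropped). A minimal usage sketch, with `struct foo` and `nr_entries` invented for the example:

	static int build_table(size_t nr_entries)
	{
		struct foo *table;

		/* Physically contiguous if possible, vmalloc fallback otherwise. */
		table = kvmalloc_array(nr_entries, sizeof(*table), GFP_KERNEL);
		if (!table)
			return -ENOMEM;

		/* ... populate and use table ... */

		kvfree(table);		/* correct for either allocation path */
		return 0;
	}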
/**
* __vmalloc_array - allocate memory for a virtually contiguous array.
* @n: number of elements.
diff --git a/mm/vma.c b/mm/vma.c
index bb2119e5a0d0..71ca012c616c 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -203,6 +203,38 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
}
/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_lock and by
+ * the root anon_vma's mutex.
+ */
+static void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+static void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
+
+/*
* vma_prepare() - Helper function for handling locking VMAs prior to altering
* @vp: The initialized vma_prepare struct
*/
@@ -398,7 +430,6 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
- lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
@@ -415,8 +446,9 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
* has already been checked or doesn't make sense to fail.
* VMA Iterator will point to the original VMA.
*/
-static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, int new_below)
+static __must_check int
+__split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
{
struct vma_prepare vp;
struct vm_area_struct *new;
@@ -511,38 +543,6 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
}
/*
- * vma has some anon_vma assigned, and is already inserted on that
- * anon_vma's interval trees.
- *
- * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
- * vma must be removed from the anon_vma's interval trees using
- * anon_vma_interval_tree_pre_update_vma().
- *
- * After the update, the vma will be reinserted using
- * anon_vma_interval_tree_post_update_vma().
- *
- * The entire update must be protected by exclusive mmap_lock and by
- * the root anon_vma's mutex.
- */
-void
-anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
-{
- struct anon_vma_chain *avc;
-
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
-}
-
-void
-anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
-{
- struct anon_vma_chain *avc;
-
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
-}
-
-/*
* dup_anon_vma() - Helper function to duplicate anon_vma
* @dst: The destination VMA
* @src: The source VMA
@@ -710,7 +710,8 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma)
* - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
* - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end).
*/
-static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg)
+static __must_check struct vm_area_struct *vma_merge_existing_range(
+ struct vma_merge_struct *vmg)
{
struct vm_area_struct *vma = vmg->vma;
struct vm_area_struct *prev = vmg->prev;
@@ -728,19 +729,20 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *
bool expanded;
mmap_assert_write_locked(vmg->mm);
- VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
- VM_WARN_ON(vmg->next); /* We set this. */
- VM_WARN_ON(prev && start <= prev->vm_start);
- VM_WARN_ON(start >= end);
+ VM_WARN_ON_VMG(!vma, vmg); /* We are modifying a VMA, so caller must specify. */
+ VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */
+ VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg);
+ VM_WARN_ON_VMG(start >= end, vmg);
+
/*
* If vma == prev, then we are offset into a VMA. Otherwise, if we are
* not, we must span a portion of the VMA.
*/
- VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
- vmg->end > vma->vm_end));
+ VM_WARN_ON_VMG(vma && ((vma != prev && vmg->start != vma->vm_start) ||
+ vmg->end > vma->vm_end), vmg);
/* The vmi must be positioned within vmg->vma. */
- VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start &&
- vma_iter_addr(vmg->vmi) < vma->vm_end));
+ VM_WARN_ON_VMG(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start &&
+ vma_iter_addr(vmg->vmi) < vma->vm_end), vmg);
vmg->state = VMA_MERGE_NOMERGE;
@@ -857,9 +859,9 @@ static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *
pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
- VM_WARN_ON(!merge_right);
+ VM_WARN_ON_VMG(!merge_right, vmg);
/* If we are offset into a VMA, then prev must be vma. */
- VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
+ VM_WARN_ON_VMG(vmg->start > vma->vm_start && prev && vma != prev, vmg);
if (merge_will_delete_vma) {
vmg->vma = next;
@@ -971,9 +973,9 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND;
mmap_assert_write_locked(vmg->mm);
- VM_WARN_ON(vmg->vma);
+ VM_WARN_ON_VMG(vmg->vma, vmg);
/* vmi must point at or before the gap. */
- VM_WARN_ON(vma_iter_addr(vmg->vmi) > end);
+ VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg);
vmg->state = VMA_MERGE_NOMERGE;
@@ -1055,7 +1057,7 @@ int vma_expand(struct vma_merge_struct *vmg)
remove_next = true;
/* This should already have been checked by this point. */
- VM_WARN_ON(!can_merge_remove_vma(next));
+ VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
vma_start_write(next);
ret = dup_anon_vma(vma, next, &anon_dup);
if (ret)
@@ -1063,10 +1065,10 @@ int vma_expand(struct vma_merge_struct *vmg)
}
/* Not merging but overwriting any part of next is not handled. */
- VM_WARN_ON(next && !remove_next &&
- next != vma && vmg->end > next->vm_start);
+ VM_WARN_ON_VMG(next && !remove_next &&
+ next != vma && vmg->end > next->vm_start, vmg);
/* Only handles expanding */
- VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
+ VM_WARN_ON_VMG(vma->vm_start < vmg->start || vma->vm_end > vmg->end, vmg);
if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
goto nomem;
@@ -1130,7 +1132,6 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
* were isolated before we downgraded mmap_lock.
*/
mas_set(mas_detach, 1);
- lru_add_drain();
tlb_gather_mmu(&tlb, vms->vma->vm_mm);
update_hiwater_rss(vms->vma->vm_mm);
unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
@@ -1508,24 +1509,28 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
{
struct vm_area_struct *vma = vmg->vma;
+ unsigned long start = vmg->start;
+ unsigned long end = vmg->end;
struct vm_area_struct *merged;
/* First, try to merge. */
merged = vma_merge_existing_range(vmg);
if (merged)
return merged;
+ if (vmg_nomem(vmg))
+ return ERR_PTR(-ENOMEM);
/* Split any preceding portion of the VMA. */
- if (vma->vm_start < vmg->start) {
- int err = split_vma(vmg->vmi, vma, vmg->start, 1);
+ if (vma->vm_start < start) {
+ int err = split_vma(vmg->vmi, vma, start, 1);
if (err)
return ERR_PTR(err);
}
/* Split any trailing portion of the VMA. */
- if (vma->vm_end > vmg->end) {
- int err = split_vma(vmg->vmi, vma, vmg->end, 0);
+ if (vma->vm_end > end) {
+ int err = split_vma(vmg->vmi, vma, end, 0);
if (err)
return ERR_PTR(err);
@@ -2376,7 +2381,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
* vma_merge_new_range() calls khugepaged_enter_vma() too, the below
* call covers the non-merge case.
*/
- khugepaged_enter_vma(vma, map->flags);
+ if (!vma_is_anonymous(vma))
+ khugepaged_enter_vma(vma, map->flags);
ksm_add_vma(vma);
*vmap = vma;
return 0;
@@ -2430,7 +2436,7 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
vma_set_page_prot(vma);
}
-unsigned long __mmap_region(struct file *file, unsigned long addr,
+static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
@@ -2481,3 +2487,476 @@ abort_munmap:
vms_abort_munmap_vmas(&map.vms, &map.mas_detach);
return error;
}
+
+/**
+ * mmap_region() - Actually perform the userland mapping of a VMA into
+ * current->mm with known, aligned and overflow-checked @addr and @len, and
+ * correctly determined VMA flags @vm_flags and page offset @pgoff.
+ *
+ * This is an internal memory management function, and should not be used
+ * directly.
+ *
+ * The caller must write-lock current->mm->mmap_lock.
+ *
+ * @file: If a file-backed mapping, a pointer to the struct file describing the
+ * file to be mapped, otherwise NULL.
+ * @addr: The page-aligned address at which to perform the mapping.
+ * @len: The page-aligned, non-zero, length of the mapping.
+ * @vm_flags: The VMA flags which should be applied to the mapping.
+ * @pgoff: If @file is specified, the page offset into the file, if not then
+ * the virtual page offset in memory of the anonymous mapping.
+ * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap
+ * events.
+ *
+ * Returns: Either an error, or the address at which the requested mapping has
+ * been performed.
+ */
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
+{
+ unsigned long ret;
+ bool writable_file_mapping = false;
+
+ mmap_assert_write_locked(current->mm);
+
+ /* Check to see if MDWE is applicable. */
+ if (map_deny_write_exec(vm_flags, vm_flags))
+ return -EACCES;
+
+ /* Allow architectures to sanity-check the vm_flags. */
+ if (!arch_validate_flags(vm_flags))
+ return -EINVAL;
+
+ /* Map writable and ensure this isn't a sealed memfd. */
+ if (file && is_shared_maywrite(vm_flags)) {
+ int error = mapping_map_writable(file->f_mapping);
+
+ if (error)
+ return error;
+ writable_file_mapping = true;
+ }
+
+ ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
+
+ /* Clear our write mapping regardless of error. */
+ if (writable_file_mapping)
+ mapping_unmap_writable(file->f_mapping);
+
+ validate_mm(current->mm);
+ return ret;
+}
+
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @vmi: The vma iterator
+ * @addr: The start address
+ * @len: The length of the increase
+ * @vma: The vma,
+ * @flags: The VMA Flags
+ *
+ * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA. Eventually we may be able to
+ * do some brk-specific accounting here.
+ */
+int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long len, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+
+ /*
+ * Check against address space limits by the changed size
+ * Note: This happens *after* clearing old mappings in some code paths.
+ */
+ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+ if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ if (mm->map_count > sysctl_max_map_count)
+ return -ENOMEM;
+
+ if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ /*
+ * Expand the existing vma if possible; Note that singular lists do not
+ * occur after forking, so the expand will only happen on new VMAs.
+ */
+ if (vma && vma->vm_end == addr) {
+ VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
+
+ vmg.prev = vma;
+ /* vmi is positioned at prev, which this mode expects. */
+ vmg.merge_flags = VMG_FLAG_JUST_EXPAND;
+
+ if (vma_merge_new_range(&vmg))
+ goto out;
+ else if (vmg_nomem(&vmg))
+ goto unacct_fail;
+ }
+
+ if (vma)
+ vma_iter_next_range(vmi);
+ /* create a vma struct for an anonymous mapping */
+ vma = vm_area_alloc(mm);
+ if (!vma)
+ goto unacct_fail;
+
+ vma_set_anonymous(vma);
+ vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
+ vm_flags_init(vma, flags);
+ vma->vm_page_prot = vm_get_page_prot(flags);
+ vma_start_write(vma);
+ if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
+ goto mas_store_fail;
+
+ mm->map_count++;
+ validate_mm(mm);
+ ksm_add_vma(vma);
+out:
+ perf_event_mmap(vma);
+ mm->total_vm += len >> PAGE_SHIFT;
+ mm->data_vm += len >> PAGE_SHIFT;
+ if (flags & VM_LOCKED)
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ vm_flags_set(vma, VM_SOFTDIRTY);
+ return 0;
+
+mas_store_fail:
+ vm_area_free(vma);
+unacct_fail:
+ vm_unacct_memory(len >> PAGE_SHIFT);
+ return -ENOMEM;
+}
+
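A rough sketch, not taken from this patch, of how a brk(2)-style caller might drive do_brk_flags(): the iterator is positioned at the VMA preceding the new range (if any) and flags are normally 0. Here `mm`, `oldbrk` and `newbrk` are assumed to come from the caller:

	VMA_ITERATOR(vmi, mm, oldbrk);
	struct vm_area_struct *brkvma;

	/* Illustrative only: grow the heap from oldbrk to newbrk. */
	brkvma = vma_prev(&vmi);
	if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
		return -ENOMEM;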
+/**
+ * unmapped_area() - Find an area between the low_limit and the high_limit with
+ * the correct alignment and offset, all from @info. Note: current->mm is used
+ * for the search.
+ *
+ * @info: The unmapped area information including the range [low_limit -
+ * high_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
+unsigned long unmapped_area(struct vm_unmapped_area_info *info)
+{
+ unsigned long length, gap;
+ unsigned long low_limit, high_limit;
+ struct vm_area_struct *tmp;
+ VMA_ITERATOR(vmi, current->mm, 0);
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask + info->start_gap;
+ if (length < info->length)
+ return -ENOMEM;
+
+ low_limit = info->low_limit;
+ if (low_limit < mmap_min_addr)
+ low_limit = mmap_min_addr;
+ high_limit = info->high_limit;
+retry:
+ if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
+ return -ENOMEM;
+
+ /*
+ * Adjust for the gap first so it doesn't interfere with the
+ * later alignment. The first step is the minimum needed to
+ * fulfill the start gap, the next step is the minimum to align
+ * that. Together they are the minimum needed to fulfill both.
+ */
+ gap = vma_iter_addr(&vmi) + info->start_gap;
+ gap += (info->align_offset - gap) & info->align_mask;
+ tmp = vma_next(&vmi);
+ if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
+ if (vm_start_gap(tmp) < gap + length - 1) {
+ low_limit = tmp->vm_end;
+ vma_iter_reset(&vmi);
+ goto retry;
+ }
+ } else {
+ tmp = vma_prev(&vmi);
+ if (tmp && vm_end_gap(tmp) > gap) {
+ low_limit = vm_end_gap(tmp);
+ vma_iter_reset(&vmi);
+ goto retry;
+ }
+ }
+
+ return gap;
+}
+
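A worked example of the gap/alignment arithmetic above, as a tiny standalone program (values chosen only for illustration):

	#include <stdio.h>
	#include <stdint.h>

	/*
	 * Mirrors the rounding in unmapped_area(): bump 'gap' up so its low
	 * bits match align_offset within align_mask.
	 */
	static uint64_t round_gap(uint64_t addr, uint64_t start_gap,
				  uint64_t align_offset, uint64_t align_mask)
	{
		uint64_t gap = addr + start_gap;

		gap += (align_offset - gap) & align_mask;
		return gap;
	}

	int main(void)
	{
		/* 64 KiB alignment: 0x7f0000001000 rounds up to 0x7f0000010000. */
		printf("%#llx\n", (unsigned long long)
		       round_gap(0x7f0000001000ULL, 0, 0, 0xffff));
		return 0;
	}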
+/**
+ * unmapped_area_topdown() - Find an area between the low_limit and the
+ * high_limit with the correct alignment and offset at the highest available
+ * address, all from @info. Note: current->mm is used for the search.
+ *
+ * @info: The unmapped area information including the range [low_limit -
+ * high_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
+unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
+{
+ unsigned long length, gap, gap_end;
+ unsigned long low_limit, high_limit;
+ struct vm_area_struct *tmp;
+ VMA_ITERATOR(vmi, current->mm, 0);
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask + info->start_gap;
+ if (length < info->length)
+ return -ENOMEM;
+
+ low_limit = info->low_limit;
+ if (low_limit < mmap_min_addr)
+ low_limit = mmap_min_addr;
+ high_limit = info->high_limit;
+retry:
+ if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
+ return -ENOMEM;
+
+ gap = vma_iter_end(&vmi) - info->length;
+ gap -= (gap - info->align_offset) & info->align_mask;
+ gap_end = vma_iter_end(&vmi);
+ tmp = vma_next(&vmi);
+ if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
+ if (vm_start_gap(tmp) < gap_end) {
+ high_limit = vm_start_gap(tmp);
+ vma_iter_reset(&vmi);
+ goto retry;
+ }
+ } else {
+ tmp = vma_prev(&vmi);
+ if (tmp && vm_end_gap(tmp) > gap) {
+ high_limit = tmp->vm_start;
+ vma_iter_reset(&vmi);
+ goto retry;
+ }
+ }
+
+ return gap;
+}
+
+/*
+ * Verify that the stack growth is acceptable and
+ * update accounting. This is shared with both the
+ * grow-up and grow-down cases.
+ */
+static int acct_stack_growth(struct vm_area_struct *vma,
+ unsigned long size, unsigned long grow)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long new_start;
+
+ /* address space limit tests */
+ if (!may_expand_vm(mm, vma->vm_flags, grow))
+ return -ENOMEM;
+
+ /* Stack limit test */
+ if (size > rlimit(RLIMIT_STACK))
+ return -ENOMEM;
+
+ /* mlock limit tests */
+ if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
+ return -ENOMEM;
+
+ /* Check to ensure the stack will not grow into a hugetlb-only region */
+ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
+ vma->vm_end - size;
+ if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+ return -EFAULT;
+
+ /*
+ * Overcommit.. This must be the final test, as it will
+ * update security statistics.
+ */
+ if (security_vm_enough_memory_mm(mm, grow))
+ return -ENOMEM;
+
+ return 0;
+}
+
+#if defined(CONFIG_STACK_GROWSUP)
+/*
+ * PA-RISC uses this for its stack.
+ * vma is the last one with address > vma->vm_end. Have to extend vma.
+ */
+int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *next;
+ unsigned long gap_addr;
+ int error = 0;
+ VMA_ITERATOR(vmi, mm, vma->vm_start);
+
+ if (!(vma->vm_flags & VM_GROWSUP))
+ return -EFAULT;
+
+ mmap_assert_write_locked(mm);
+
+ /* Guard against exceeding limits of the address space. */
+ address &= PAGE_MASK;
+ if (address >= (TASK_SIZE & PAGE_MASK))
+ return -ENOMEM;
+ address += PAGE_SIZE;
+
+ /* Enforce stack_guard_gap */
+ gap_addr = address + stack_guard_gap;
+
+ /* Guard against overflow */
+ if (gap_addr < address || gap_addr > TASK_SIZE)
+ gap_addr = TASK_SIZE;
+
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
+ if (!(next->vm_flags & VM_GROWSUP))
+ return -ENOMEM;
+ /* Check that both stack segments have the same anon_vma? */
+ }
+
+ if (next)
+ vma_iter_prev_range_limit(&vmi, address);
+
+ vma_iter_config(&vmi, vma->vm_start, address);
+ if (vma_iter_prealloc(&vmi, vma))
+ return -ENOMEM;
+
+ /* We must make sure the anon_vma is allocated. */
+ if (unlikely(anon_vma_prepare(vma))) {
+ vma_iter_free(&vmi);
+ return -ENOMEM;
+ }
+
+ /* Lock the VMA before expanding to prevent concurrent page faults */
+ vma_start_write(vma);
+ /* We update the anon VMA tree. */
+ anon_vma_lock_write(vma->anon_vma);
+
+ /* Somebody else might have raced and expanded it already */
+ if (address > vma->vm_end) {
+ unsigned long size, grow;
+
+ size = address - vma->vm_start;
+ grow = (address - vma->vm_end) >> PAGE_SHIFT;
+
+ error = -ENOMEM;
+ if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags, grow);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ vma->vm_end = address;
+ /* Overwrite old entry in mtree. */
+ vma_iter_store(&vmi, vma);
+ anon_vma_interval_tree_post_update_vma(vma);
+
+ perf_event_mmap(vma);
+ }
+ }
+ }
+ anon_vma_unlock_write(vma->anon_vma);
+ vma_iter_free(&vmi);
+ validate_mm(mm);
+ return error;
+}
+#endif /* CONFIG_STACK_GROWSUP */
+
+/*
+ * vma is the first one with address < vma->vm_start. Have to extend vma.
+ * mmap_lock held for writing.
+ */
+int expand_downwards(struct vm_area_struct *vma, unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *prev;
+ int error = 0;
+ VMA_ITERATOR(vmi, mm, vma->vm_start);
+
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ return -EFAULT;
+
+ mmap_assert_write_locked(mm);
+
+ address &= PAGE_MASK;
+ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
+ return -EPERM;
+
+ /* Enforce stack_guard_gap */
+ prev = vma_prev(&vmi);
+ /* Check that both stack segments have the same anon_vma? */
+ if (prev) {
+ if (!(prev->vm_flags & VM_GROWSDOWN) &&
+ vma_is_accessible(prev) &&
+ (address - prev->vm_end < stack_guard_gap))
+ return -ENOMEM;
+ }
+
+ if (prev)
+ vma_iter_next_range_limit(&vmi, vma->vm_start);
+
+ vma_iter_config(&vmi, address, vma->vm_end);
+ if (vma_iter_prealloc(&vmi, vma))
+ return -ENOMEM;
+
+ /* We must make sure the anon_vma is allocated. */
+ if (unlikely(anon_vma_prepare(vma))) {
+ vma_iter_free(&vmi);
+ return -ENOMEM;
+ }
+
+ /* Lock the VMA before expanding to prevent concurrent page faults */
+ vma_start_write(vma);
+ /* We update the anon VMA tree. */
+ anon_vma_lock_write(vma->anon_vma);
+
+ /* Somebody else might have raced and expanded it already */
+ if (address < vma->vm_start) {
+ unsigned long size, grow;
+
+ size = vma->vm_end - address;
+ grow = (vma->vm_start - address) >> PAGE_SHIFT;
+
+ error = -ENOMEM;
+ if (grow <= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags, grow);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ vma->vm_start = address;
+ vma->vm_pgoff -= grow;
+ /* Overwrite old entry in mtree. */
+ vma_iter_store(&vmi, vma);
+ anon_vma_interval_tree_post_update_vma(vma);
+
+ perf_event_mmap(vma);
+ }
+ }
+ }
+ anon_vma_unlock_write(vma->anon_vma);
+ vma_iter_free(&vmi);
+ validate_mm(mm);
+ return error;
+}
+
+int __vm_munmap(unsigned long start, size_t len, bool unlock)
+{
+ int ret;
+ struct mm_struct *mm = current->mm;
+ LIST_HEAD(uf);
+ VMA_ITERATOR(vmi, mm, start);
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);
+ if (ret || !unlock)
+ mmap_write_unlock(mm);
+
+ userfaultfd_unmap_complete(mm, &uf);
+ return ret;
+}
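For context on the `unlock` parameter: in mainline, the exported vm_munmap() and the munmap(2) syscall are both thin wrappers around this helper, roughly as below (sketch, not part of this hunk):

	int vm_munmap(unsigned long start, size_t len)
	{
		return __vm_munmap(start, len, false);
	}

	SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
	{
		addr = untagged_addr(addr);
		return __vm_munmap(addr, len, true);	/* lock already dropped on success */
	}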
diff --git a/mm/vma.h b/mm/vma.h
index 388d34748674..a2e8710b8c47 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -139,15 +139,10 @@ void validate_mm(struct mm_struct *mm);
#define validate_mm(mm) do { } while (0)
#endif
-/* Required for expand_downwards(). */
-void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
-
-/* Required for expand_downwards(). */
-void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
-
-int vma_expand(struct vma_merge_struct *vmg);
-int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff);
+__must_check int vma_expand(struct vma_merge_struct *vmg);
+__must_check int vma_shrink(struct vma_iterator *vmi,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff);
static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
struct vm_area_struct *vma, gfp_t gfp)
@@ -180,13 +175,14 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);
/* We are about to modify the VMA's flags. */
-struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
+__must_check struct vm_area_struct
+*vma_modify_flags(struct vma_iterator *vmi,
struct vm_area_struct *prev, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags);
/* We are about to modify the VMA's flags and/or anon_name. */
-struct vm_area_struct
+__must_check struct vm_area_struct
*vma_modify_flags_name(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
@@ -196,7 +192,7 @@ struct vm_area_struct
struct anon_vma_name *new_name);
/* We are about to modify the VMA's memory policy. */
-struct vm_area_struct
+__must_check struct vm_area_struct
*vma_modify_policy(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
@@ -204,7 +200,7 @@ struct vm_area_struct
struct mempolicy *new_pol);
/* We are about to modify the VMA's flags and/or uffd context. */
-struct vm_area_struct
+__must_check struct vm_area_struct
*vma_modify_flags_uffd(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
@@ -212,11 +208,13 @@ struct vm_area_struct
unsigned long new_flags,
struct vm_userfaultfd_ctx new_ctx);
-struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg);
+__must_check struct vm_area_struct
+*vma_merge_new_range(struct vma_merge_struct *vmg);
-struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
- struct vm_area_struct *vma,
- unsigned long delta);
+__must_check struct vm_area_struct
+*vma_merge_extend(struct vma_iterator *vmi,
+ struct vm_area_struct *vma,
+ unsigned long delta);
void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb);
@@ -243,10 +241,16 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
int mm_take_all_locks(struct mm_struct *mm);
void mm_drop_all_locks(struct mm_struct *mm);
-unsigned long __mmap_region(struct file *file, unsigned long addr,
+unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf);
+int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
+ unsigned long addr, unsigned long request, unsigned long flags);
+
+unsigned long unmapped_area(struct vm_unmapped_area_info *info);
+unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
+
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
/*
@@ -472,4 +476,12 @@ static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
#endif
+#if defined(CONFIG_STACK_GROWSUP)
+int expand_upwards(struct vm_area_struct *vma, unsigned long address);
+#endif
+
+int expand_downwards(struct vm_area_struct *vma, unsigned long address);
+
+int __vm_munmap(unsigned long start, size_t len, bool unlock);
+
#endif /* __MM_VMA_H */
diff --git a/mm/vma_internal.h b/mm/vma_internal.h
index fc5f172a36bd..2f05735ff190 100644
--- a/mm/vma_internal.h
+++ b/mm/vma_internal.h
@@ -35,6 +35,7 @@
#include <linux/mutex.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
+#include <linux/personality.h>
#include <linux/pfn.h>
#include <linux/rcupdate.h>
#include <linux/rmap.h>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5c88d0e90c20..61981ee1c9d2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -586,13 +586,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
mask |= PGTBL_PGD_MODIFIED;
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
- return err;
+ break;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
- return 0;
+ return err;
}
/*
@@ -3562,11 +3562,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* but mempolicy wants to alloc memory by interleaving.
*/
if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
- nr = alloc_pages_bulk_array_mempolicy_noprof(gfp,
+ nr = alloc_pages_bulk_mempolicy_noprof(gfp,
nr_pages_request,
pages + nr_allocated);
else
- nr = alloc_pages_bulk_array_node_noprof(gfp, nid,
+ nr = alloc_pages_bulk_node_noprof(gfp, nid,
nr_pages_request,
pages + nr_allocated);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1ec5ece067e..c767d71c43d7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -862,6 +862,31 @@ enum folio_references {
FOLIOREF_ACTIVATE,
};
+#ifdef CONFIG_LRU_GEN
+/*
+ * Only used on a mapped folio in the eviction (rmap walk) path, where promotion
+ * needs to be done by taking the folio off the LRU list and then adding it back
+ * with PG_active set. In contrast, the aging (page table walk) path uses
+ * folio_update_gen().
+ */
+static bool lru_gen_set_refs(struct folio *folio)
+{
+ /* see the comment on LRU_REFS_FLAGS */
+ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ return false;
+ }
+
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset));
+ return true;
+}
+#else
+static bool lru_gen_set_refs(struct folio *folio)
+{
+ return false;
+}
+#endif /* CONFIG_LRU_GEN */
+
static enum folio_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
@@ -870,7 +895,6 @@ static enum folio_references folio_check_references(struct folio *folio,
referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
&vm_flags);
- referenced_folio = folio_test_clear_referenced(folio);
/*
* The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
@@ -888,6 +912,15 @@ static enum folio_references folio_check_references(struct folio *folio,
if (referenced_ptes == -1)
return FOLIOREF_KEEP;
+ if (lru_gen_enabled()) {
+ if (!referenced_ptes)
+ return FOLIOREF_RECLAIM;
+
+ return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP;
+ }
+
+ referenced_folio = folio_test_clear_referenced(folio);
+
if (referenced_ptes) {
/*
* All mapped folios start out with page table
@@ -1053,7 +1086,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
LIST_HEAD(demote_folios);
- unsigned int nr_reclaimed = 0;
+ unsigned int nr_reclaimed = 0, nr_demoted = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
struct swap_iocb *plug = NULL;
@@ -1092,11 +1125,6 @@ retry:
if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
- /* folio_update_gen() tried to promote this page? */
- if (lru_gen_enabled() && !ignore_references &&
- folio_mapped(folio) && folio_test_referenced(folio))
- goto keep_locked;
-
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -1522,8 +1550,9 @@ keep:
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
- nr_reclaimed += stat->nr_demoted;
+ nr_demoted = demote_folio_list(&demote_folios, pgdat);
+ nr_reclaimed += nr_demoted;
+ stat->nr_demoted += nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */
if (!list_empty(&demote_folios)) {
/* Folios which weren't demoted go back on @folio_list */
@@ -1664,6 +1693,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
+ unsigned long max_nr_skipped = 0;
LIST_HEAD(folios_skipped);
total_scan = 0;
@@ -1678,9 +1708,12 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (folio_zonenum(folio) > sc->reclaim_idx) {
+ /* Use max_nr_skipped to prevent a hard lockup */
+ if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED &&
+ (folio_zonenum(folio) > sc->reclaim_idx)) {
nr_skipped[folio_zonenum(folio)] += nr_pages;
move_to = &folios_skipped;
+ max_nr_skipped++;
goto move;
}
@@ -2619,11 +2652,17 @@ static bool should_clear_pmd_young(void)
READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
}
+#define evictable_min_seq(min_seq, swappiness) \
+ min((min_seq)[!(swappiness)], (min_seq)[(swappiness) <= MAX_SWAPPINESS])
+
#define for_each_gen_type_zone(gen, type, zone) \
for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+#define for_each_evictable_type(type, swappiness) \
+ for ((type) = !(swappiness); (type) <= ((swappiness) <= MAX_SWAPPINESS); (type)++)
+
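A short illustration of how the swappiness value selects the evictable LRU types in the new macros above, assuming LRU_GEN_ANON == 0, LRU_GEN_FILE == 1 and MAX_SWAPPINESS == 200 as in current mainline:

	/*
	 * swappiness == 0               -> file only  (type runs 1..1)
	 * 1 <= swappiness <= 200        -> anon + file (type runs 0..1)
	 * swappiness > 200 (anon-only)  -> anon only   (type runs 0..0)
	 *
	 * evictable_min_seq() likewise picks the oldest seq among only the
	 * evictable type(s), e.g. min_seq[LRU_GEN_FILE] when swappiness == 0.
	 */
	int type;

	for_each_evictable_type(type, swappiness)
		pr_debug("scanning type %d\n", type);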
#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
@@ -2669,10 +2708,16 @@ static int get_nr_gens(struct lruvec *lruvec, int type)
static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
{
- /* see the comment on lru_gen_folio */
- return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
- get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
- get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
+ int type;
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ int n = get_nr_gens(lruvec, type);
+
+ if (n < MIN_NR_GENS || n > MAX_NR_GENS)
+ return false;
+ }
+
+ return true;
}
/******************************************************************************
@@ -3073,16 +3118,20 @@ struct ctrl_pos {
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct ctrl_pos *pos)
{
+ int i;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
- pos->refaulted = lrugen->avg_refaulted[type][tier] +
- atomic_long_read(&lrugen->refaulted[hist][type][tier]);
- pos->total = lrugen->avg_total[type][tier] +
- atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- pos->total += lrugen->protected[hist][type][tier - 1];
pos->gain = gain;
+ pos->refaulted = pos->total = 0;
+
+ for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) {
+ pos->refaulted += lrugen->avg_refaulted[type][i] +
+ atomic_long_read(&lrugen->refaulted[hist][type][i]);
+ pos->total += lrugen->avg_total[type][i] +
+ lrugen->protected[hist][type][i] +
+ atomic_long_read(&lrugen->evicted[hist][type][i]);
+ }
}
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
@@ -3108,17 +3157,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
sum = lrugen->avg_total[type][tier] +
+ lrugen->protected[hist][type][tier] +
atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- sum += lrugen->protected[hist][type][tier - 1];
WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
}
if (clear) {
atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
- if (tier)
- WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
+ WRITE_ONCE(lrugen->protected[hist][type][tier], 0);
}
}
}
@@ -3145,16 +3192,19 @@ static int folio_update_gen(struct folio *folio, int gen)
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ /* see the comment on LRU_REFS_FLAGS */
+ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ return -1;
+ }
+
do {
/* lru_gen_del_folio() has isolated this page? */
- if (!(old_flags & LRU_GEN_MASK)) {
- /* for shrink_folio_list() */
- new_flags = old_flags | BIT(PG_referenced);
- continue;
- }
+ if (!(old_flags & LRU_GEN_MASK))
+ return -1;
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
- new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
+ new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset);
} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
@@ -3178,7 +3228,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
new_gen = (old_gen + 1) % MAX_NR_GENS;
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
/* for folio_end_writeback() */
if (reclaiming)
@@ -3253,7 +3303,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
return true;
if (vma_is_anonymous(vma))
- return !walk->can_swap;
+ return !walk->swappiness;
if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
return true;
@@ -3263,7 +3313,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
return true;
if (shmem_mapping(mapping))
- return !walk->can_swap;
+ return !walk->swappiness;
+
+ if (walk->swappiness > MAX_SWAPPINESS)
+ return true;
/* to exclude special mappings like dax, etc. */
return !mapping->a_ops->read_folio;
@@ -3351,19 +3404,17 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
}
static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
- struct pglist_data *pgdat, bool can_swap)
+ struct pglist_data *pgdat)
{
- struct folio *folio;
+ struct folio *folio = pfn_folio(pfn);
- folio = pfn_folio(pfn);
- if (folio_nid(folio) != pgdat->node_id)
+ if (folio_lru_gen(folio) < 0)
return NULL;
- if (folio_memcg(folio) != memcg)
+ if (folio_nid(folio) != pgdat->node_id)
return NULL;
- /* file VMAs can contain anon pages from COW */
- if (!folio_is_file_lru(folio) && !can_swap)
+ if (folio_memcg(folio) != memcg)
return NULL;
return folio;
@@ -3377,29 +3428,55 @@ static bool suitable_to_scan(int total, int young)
return young * n >= total;
}
+static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio,
+ int new_gen, bool dirty)
+{
+ int old_gen;
+
+ if (!folio)
+ return;
+
+ if (dirty && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ if (walk) {
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ } else if (lru_gen_set_refs(folio)) {
+ old_gen = folio_lru_gen(folio);
+ if (old_gen >= 0 && old_gen != new_gen)
+ folio_activate(folio);
+ }
+}
+
static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct mm_walk *args)
{
int i;
+ bool dirty;
pte_t *pte;
spinlock_t *ptl;
unsigned long addr;
int total = 0;
int young = 0;
+ struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
pmd_t pmdval;
- pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval,
- &ptl);
+ pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
if (!pte)
return false;
+
if (!spin_trylock(ptl)) {
pte_unmap(pte);
- return false;
+ return true;
}
if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
@@ -3421,26 +3498,30 @@ restart:
if (pfn == -1)
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
continue;
if (!ptep_clear_young_notify(args->vma, addr, pte + i))
continue;
- young++;
- walk->mm_stats[MM_LEAF_YOUNG]++;
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
+
+ last = folio;
+ dirty = false;
+ }
- if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ if (pte_dirty(ptent))
+ dirty = true;
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ young++;
+ walk->mm_stats[MM_LEAF_YOUNG]++;
}
+ walk_update_folio(walk, last, gen, dirty);
+ last = NULL;
+
if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
goto restart;
@@ -3454,13 +3535,15 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
{
int i;
+ bool dirty;
pmd_t *pmd;
spinlock_t *ptl;
+ struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -3506,27 +3589,30 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
if (pfn == -1)
goto next;
- folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
goto next;
if (!pmdp_clear_young_notify(vma, addr, pmd + i))
goto next;
- walk->mm_stats[MM_LEAF_YOUNG]++;
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ last = folio;
+ dirty = false;
+ }
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ if (pmd_dirty(pmd[i]))
+ dirty = true;
+
+ walk->mm_stats[MM_LEAF_YOUNG]++;
next:
i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
} while (i <= MIN_LRU_BATCH);
+ walk_update_folio(walk, last, gen, dirty);
+
arch_leave_lazy_mmu_mode();
spin_unlock(ptl);
done:
@@ -3718,22 +3804,25 @@ static void clear_mm_walk(void)
kfree(walk);
}
-static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
+static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
{
int zone;
int remaining = MAX_LRU_BATCH;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- if (type == LRU_GEN_ANON && !can_swap)
+ if (type ? swappiness > MAX_SWAPPINESS : !swappiness)
goto done;
- /* prevent cold/hot inversion if force_scan is true */
+ /* prevent cold/hot inversion if the type is evictable */
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
struct list_head *head = &lrugen->folios[old_gen][type][zone];
while (!list_empty(head)) {
struct folio *folio = lru_to_folio(head);
+ int refs = folio_lru_refs(folio);
+ bool workingset = folio_test_workingset(folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
@@ -3743,6 +3832,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
new_gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
+ /* don't count the workingset being lazily promoted */
+ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+ int tier = lru_tier_from_refs(refs, workingset);
+ int delta = folio_nr_pages(folio);
+
+ WRITE_ONCE(lrugen->protected[hist][type][tier],
+ lrugen->protected[hist][type][tier] + delta);
+ }
+
if (!--remaining)
return false;
}
@@ -3754,7 +3852,7 @@ done:
return true;
}
-static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
+static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
{
int gen, type, zone;
bool success = false;
@@ -3764,7 +3862,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
/* find the oldest populated generation */
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
gen = lru_gen_from_seq(min_seq[type]);
@@ -3780,13 +3878,17 @@ next:
}
/* see the comment on lru_gen_folio */
- if (can_swap) {
- min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
- min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
+ if (swappiness && swappiness <= MAX_SWAPPINESS) {
+ unsigned long seq = lrugen->max_seq - MIN_NR_GENS;
+
+ if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq)
+ min_seq[LRU_GEN_ANON] = seq;
+ else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq)
+ min_seq[LRU_GEN_FILE] = seq;
}
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
- if (min_seq[type] == lrugen->min_seq[type])
+ for_each_evictable_type(type, swappiness) {
+ if (min_seq[type] <= lrugen->min_seq[type])
continue;
reset_ctrl_pos(lruvec, type, true);
@@ -3797,8 +3899,7 @@ next:
return success;
}
-static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness)
{
bool success;
int prev, next;
@@ -3816,13 +3917,11 @@ restart:
if (!success)
goto unlock;
- for (type = ANON_AND_FILE - 1; type >= 0; type--) {
+ for (type = 0; type < ANON_AND_FILE; type++) {
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;
- VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
-
- if (inc_min_seq(lruvec, type, can_swap))
+ if (inc_min_seq(lruvec, type, swappiness))
continue;
spin_unlock_irq(&lruvec->lru_lock);
@@ -3866,7 +3965,7 @@ unlock:
}
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+ int swappiness, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -3877,7 +3976,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
if (!mm_state)
- return inc_max_seq(lruvec, seq, can_swap, force_scan);
+ return inc_max_seq(lruvec, seq, swappiness);
/* see the comment in iterate_mm_list() */
if (seq <= READ_ONCE(mm_state->seq))
@@ -3902,7 +4001,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
walk->lruvec = lruvec;
walk->seq = seq;
- walk->can_swap = can_swap;
+ walk->swappiness = swappiness;
walk->force_scan = force_scan;
do {
@@ -3912,7 +4011,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
} while (mm);
done:
if (success) {
- success = inc_max_seq(lruvec, seq, can_swap, force_scan);
+ success = inc_max_seq(lruvec, seq, swappiness);
WARN_ON_ONCE(!success);
}
@@ -3953,13 +4052,13 @@ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
unsigned long total = 0;
- bool can_swap = get_swappiness(lruvec, sc);
+ int swappiness = get_swappiness(lruvec, sc);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
@@ -3979,6 +4078,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
{
int gen;
unsigned long birth;
+ int swappiness = get_swappiness(lruvec, sc);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
@@ -3988,8 +4088,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
if (!lruvec_is_sizable(lruvec, sc))
return false;
- /* see the comment on lru_gen_folio */
- gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness));
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
return time_is_before_jiffies(birth + min_ttl);
@@ -4048,21 +4147,22 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
int i;
+ bool dirty;
unsigned long start;
unsigned long end;
struct lru_gen_mm_walk *walk;
+ struct folio *last = NULL;
int young = 1;
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
struct vm_area_struct *vma = pvmw->vma;
struct folio *folio = pfn_folio(pvmw->pfn);
- bool can_swap = !folio_is_file_lru(folio);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
DEFINE_MAX_SEQ(lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
@@ -4109,37 +4209,28 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (pfn == -1)
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
continue;
if (!ptep_clear_young_notify(vma, addr, pte + i))
continue;
- young++;
-
- if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
-
- if (walk) {
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- continue;
+ last = folio;
+ dirty = false;
}
- old_gen = folio_lru_gen(folio);
- if (old_gen < 0)
- folio_set_referenced(folio);
- else if (old_gen != new_gen) {
- folio_clear_lru_refs(folio);
- folio_activate(folio);
- }
+ if (pte_dirty(ptent))
+ dirty = true;
+
+ young++;
}
+ walk_update_folio(walk, last, gen, dirty);
+
arch_leave_lazy_mmu_mode();
/* feedback from rmap walkers to page table walkers */
@@ -4297,7 +4388,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
int zone = folio_zonenum(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
- int tier = lru_tier_from_refs(refs);
+ bool workingset = folio_test_workingset(folio);
+ int tier = lru_tier_from_refs(refs, workingset);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
@@ -4319,14 +4411,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* protected */
- if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
- int hist = lru_hist_from_seq(lrugen->min_seq[type]);
-
+ if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) {
gen = folio_inc_gen(lruvec, folio, false);
- list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
+
+ /* don't count the workingset being lazily promoted */
+ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
- WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
- lrugen->protected[hist][type][tier - 1] + delta);
+ WRITE_ONCE(lrugen->protected[hist][type][tier],
+ lrugen->protected[hist][type][tier] + delta);
+ }
return true;
}
@@ -4346,8 +4441,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* waiting for writeback */
- if (folio_test_locked(folio) || writeback ||
- (type == LRU_GEN_FILE && dirty)) {
+ if (writeback || (type == LRU_GEN_FILE && dirty)) {
gen = folio_inc_gen(lruvec, folio, true);
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -4376,13 +4470,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
return false;
}
- /* see the comment on MAX_NR_TIERS */
+ /* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio))
- folio_clear_lru_refs(folio);
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, 0);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
- folio_clear_referenced(folio);
success = lru_gen_del_folio(lruvec, folio, true);
VM_WARN_ON_ONCE_FOLIO(!success, folio);
@@ -4478,13 +4571,13 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
struct ctrl_pos sp, pv;
/*
- * To leave a margin for fluctuations, use a larger gain factor (1:2).
+ * To leave a margin for fluctuations, use a larger gain factor (2:3).
* This value is chosen because any other tier would have at least twice
* as many refaults as the first tier.
*/
- read_ctrl_pos(lruvec, type, 0, 1, &sp);
+ read_ctrl_pos(lruvec, type, 0, 2, &sp);
for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- read_ctrl_pos(lruvec, type, tier, 2, &pv);
+ read_ctrl_pos(lruvec, type, tier, 3, &pv);
if (!positive_ctrl_err(&sp, &pv))
break;
}
@@ -4492,79 +4585,45 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
return tier - 1;
}
-static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
{
- int type, tier;
struct ctrl_pos sp, pv;
- int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness };
+ if (swappiness <= MIN_SWAPPINESS + 1)
+ return LRU_GEN_FILE;
+
+ if (swappiness >= MAX_SWAPPINESS)
+ return LRU_GEN_ANON;
/*
- * Compare the first tier of anon with that of file to determine which
- * type to scan. Also need to compare other tiers of the selected type
- * with the first tier of the other type to determine the last tier (of
- * the selected type) to evict.
+ * Compare the sum of all tiers of anon with that of file to determine
+ * which type to scan.
*/
- read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
- read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
- type = positive_ctrl_err(&sp, &pv);
-
- read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
- for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
- if (!positive_ctrl_err(&sp, &pv))
- break;
- }
-
- *tier_idx = tier - 1;
+ read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp);
+ read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv);
- return type;
+ return positive_ctrl_err(&sp, &pv);
}
static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
int *type_scanned, struct list_head *list)
{
int i;
- int type;
- int scanned;
- int tier = -1;
- DEFINE_MIN_SEQ(lruvec);
+ int type = get_type_to_scan(lruvec, swappiness);
- /*
- * Try to make the obvious choice first, and if anon and file are both
- * available from the same generation,
- * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon
- * first.
- * 2. If !__GFP_IO, file first since clean pagecache is more likely to
- * exist than clean swapcache.
- */
- if (!swappiness)
- type = LRU_GEN_FILE;
- else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
- type = LRU_GEN_ANON;
- else if (swappiness == 1)
- type = LRU_GEN_FILE;
- else if (swappiness == MAX_SWAPPINESS)
- type = LRU_GEN_ANON;
- else if (!(sc->gfp_mask & __GFP_IO))
- type = LRU_GEN_FILE;
- else
- type = get_type_to_scan(lruvec, swappiness, &tier);
+ for_each_evictable_type(i, swappiness) {
+ int scanned;
+ int tier = get_tier_idx(lruvec, type);
- for (i = !swappiness; i < ANON_AND_FILE; i++) {
- if (tier < 0)
- tier = get_tier_idx(lruvec, type);
+ *type_scanned = type;
scanned = scan_folios(lruvec, sc, type, tier, list);
if (scanned)
- break;
+ return scanned;
type = !type;
- tier = -1;
}
- *type_scanned = type;
-
- return scanned;
+ return 0;
}
static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
@@ -4580,6 +4639,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
struct reclaim_stat stat;
struct lru_gen_mm_walk *walk;
bool skip_retry = false;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -4589,7 +4649,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
scanned += try_to_inc_min_seq(lruvec, swappiness);
- if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
+ if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq)
scanned = 0;
spin_unlock_irq(&lruvec->lru_lock);
@@ -4605,31 +4665,24 @@ retry:
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+ DEFINE_MIN_SEQ(lruvec);
+
if (!folio_evictable(folio)) {
list_del(&folio->lru);
folio_putback_lru(folio);
continue;
}
- if (folio_test_reclaim(folio) &&
- (folio_test_dirty(folio) || folio_test_writeback(folio))) {
- /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
- if (folio_test_workingset(folio))
- folio_set_referenced(folio);
- continue;
- }
-
- if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
- folio_mapped(folio) || folio_test_locked(folio) ||
- folio_test_dirty(folio) || folio_test_writeback(folio)) {
- /* don't add rejected folios to the oldest generation */
- set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
- BIT(PG_active));
+ /* retry folios that may have missed folio_rotate_reclaimable() */
+ if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
+ !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
+ list_move(&folio->lru, &clean);
continue;
}
- /* retry folios that may have missed folio_rotate_reclaimable() */
- list_move(&folio->lru, &clean);
+ /* don't add rejected folios to the oldest generation */
+ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
}
spin_lock_irq(&lruvec->lru_lock);
@@ -4664,63 +4717,32 @@ retry:
}
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
- bool can_swap, unsigned long *nr_to_scan)
+ int swappiness, unsigned long *nr_to_scan)
{
int gen, type, zone;
- unsigned long old = 0;
- unsigned long young = 0;
- unsigned long total = 0;
+ unsigned long size = 0;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
- /* whether this lruvec is completely out of cold folios */
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
- *nr_to_scan = 0;
+ *nr_to_scan = 0;
+ /* have to run aging, since eviction is not possible anymore */
+ if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq)
return true;
- }
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
- unsigned long size = 0;
-
gen = lru_gen_from_seq(seq);
for (zone = 0; zone < MAX_NR_ZONES; zone++)
size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
-
- total += size;
- if (seq == max_seq)
- young += size;
- else if (seq + MIN_NR_GENS == max_seq)
- old += size;
}
}
- *nr_to_scan = total;
-
- /*
- * The aging tries to be lazy to reduce the overhead, while the eviction
- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
- * ideal number of generations is MIN_NR_GENS+1.
- */
- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
- return false;
-
- /*
- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
- * of the total number of pages for each generation. A reasonable range
- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
- * aging cares about the upper bound of hot pages, while the eviction
- * cares about the lower bound of cold pages.
- */
- if (young * MIN_NR_GENS > total)
- return true;
- if (old * (MIN_NR_GENS + 2) < total)
- return true;
-
- return false;
+ *nr_to_scan = size;
+ /* better to run aging even though eviction is still possible */
+ return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq;
}
/*
@@ -4728,7 +4750,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
bool success;
unsigned long nr_to_scan;
@@ -4738,7 +4760,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
return -1;
- success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan);
+ success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan);
/* try to scrape all its memory if this memcg was deleted */
if (nr_to_scan && !mem_cgroup_online(memcg))
@@ -4749,7 +4771,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
return nr_to_scan >> sc->priority;
/* stop scanning this lruvec as it's low on cold folios */
- return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0;
+ return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0;
}
static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
@@ -5293,8 +5315,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
s = "rep";
n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier]);
}
for (i = 0; i < 3; i++)
@@ -5349,7 +5370,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
seq_printf(m, " node %5d\n", nid);
if (!full)
- seq = min_seq[LRU_GEN_ANON];
+ seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2);
else if (max_seq >= MAX_NR_GENS)
seq = max_seq - MAX_NR_GENS + 1;
else
@@ -5389,23 +5410,14 @@ static const struct seq_operations lru_gen_seq_ops = {
};
static int run_aging(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+ int swappiness, bool force_scan)
{
DEFINE_MAX_SEQ(lruvec);
- DEFINE_MIN_SEQ(lruvec);
-
- if (seq < max_seq)
- return 0;
if (seq > max_seq)
return -EINVAL;
- if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
- return -ERANGE;
-
- try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
-
- return 0;
+ return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST;
}
static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
@@ -5421,7 +5433,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
while (!signal_pending(current)) {
DEFINE_MIN_SEQ(lruvec);
- if (seq < min_seq[!swappiness])
+ if (seq < evictable_min_seq(min_seq, swappiness))
return 0;
if (sc->nr_reclaimed >= nr_to_reclaim)
@@ -5466,7 +5478,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (swappiness < MIN_SWAPPINESS)
swappiness = get_swappiness(lruvec, sc);
- else if (swappiness > MAX_SWAPPINESS)
+ else if (swappiness > MAX_SWAPPINESS + 1)
goto done;
switch (cmd) {
@@ -7192,10 +7204,6 @@ static int kswapd(void *p)
unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-
- if (!cpumask_empty(cpumask))
- set_cpus_allowed_ptr(tsk, cpumask);
/*
* Tell the memory management that we're a "memory allocator",
@@ -7364,13 +7372,15 @@ void __meminit kswapd_run(int nid)
pgdat_kswapd_lock(pgdat);
if (!pgdat->kswapd) {
- pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
pr_err("Failed to start kswapd on node %d,ret=%ld\n",
nid, PTR_ERR(pgdat->kswapd));
BUG_ON(system_state < SYSTEM_RUNNING);
pgdat->kswapd = NULL;
+ } else {
+ wake_up_process(pgdat->kswapd);
}
}
pgdat_kswapd_unlock(pgdat);
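A note on the vmscan.c rework above: get_type_to_scan() now short-circuits at the swappiness extremes and otherwise compares the summed refault statistics of all tiers between anon and file; run_cmd() correspondingly accepts values up to MAX_SWAPPINESS + 1. Below is a minimal standalone model of the fast paths only, a sketch with constants mirrored here purely for illustration (the kernel's own definitions live in its headers):

#include <stdio.h>

/* Illustrative stand-ins; the kernel defines these itself. */
#define MIN_SWAPPINESS	0
#define MAX_SWAPPINESS	200
enum { LRU_GEN_ANON, LRU_GEN_FILE };

/* Sketch of the fast paths in the reworked get_type_to_scan(). */
static int type_to_scan_sketch(int swappiness)
{
	if (swappiness <= MIN_SWAPPINESS + 1)
		return LRU_GEN_FILE;	/* little or no swapping: file first */
	if (swappiness >= MAX_SWAPPINESS)
		return LRU_GEN_ANON;	/* maximal swappiness or anon-only */
	return -1;			/* in between: compare ctrl_pos of both types */
}

int main(void)
{
	printf("%d %d %d\n", type_to_scan_sketch(0),
	       type_to_scan_sketch(60), type_to_scan_sketch(MAX_SWAPPINESS + 1));
	return 0;
}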
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 16bfe1c694dd..88998725f1c5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1435,6 +1435,8 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_X86
"direct_map_level2_splits",
"direct_map_level3_splits",
+ "direct_map_level2_collapses",
+ "direct_map_level3_collapses",
#endif
#ifdef CONFIG_PER_VMA_LOCK_STATS
"vma_lock_success",
diff --git a/mm/workingset.c b/mm/workingset.c
index a4705e196545..4841ae8af411 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -239,7 +239,8 @@ static void *lru_gen_eviction(struct folio *folio)
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
- int tier = lru_tier_from_refs(refs);
+ bool workingset = folio_test_workingset(folio);
+ int tier = lru_tier_from_refs(refs, workingset);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
@@ -253,18 +254,18 @@ static void *lru_gen_eviction(struct folio *folio)
hist = lru_hist_from_seq(min_seq);
atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
- return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
}
/*
* Tests if the shadow entry is for a folio that was recently evicted.
* Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
*/
-static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
unsigned long *token, bool *workingset)
{
int memcg_id;
- unsigned long min_seq;
+ unsigned long max_seq;
struct mem_cgroup *memcg;
struct pglist_data *pgdat;
@@ -273,8 +274,10 @@ static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
memcg = mem_cgroup_from_id(memcg_id);
*lruvec = mem_cgroup_lruvec(memcg, pgdat);
- min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
- return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
+ max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
+ max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;
+
+ return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
}
static void lru_gen_refault(struct folio *folio, void *shadow)
@@ -290,7 +293,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
rcu_read_lock();
- recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset);
+ recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset);
if (lruvec != folio_lruvec(folio))
goto unlock;
@@ -302,24 +305,20 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
lrugen = &lruvec->lrugen;
hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
- /* see the comment in folio_lru_refs() */
- refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
- tier = lru_tier_from_refs(refs);
+ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
+ tier = lru_tier_from_refs(refs, workingset);
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
- mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
- /*
- * Count the following two cases as stalls:
- * 1. For pages accessed through page tables, hotter pages pushed out
- * hot pages which refaulted immediately.
- * 2. For pages accessed multiple times through file descriptors,
- * they would have been protected by sort_folio().
- */
- if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) {
- set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset));
+ /* see folio_add_lru() where folio_set_active() will be called */
+ if (lru_gen_in_fault())
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+
+ if (workingset) {
+ folio_set_workingset(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
- }
+ } else
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
rcu_read_unlock();
}
@@ -331,7 +330,7 @@ static void *lru_gen_eviction(struct folio *folio)
return NULL;
}
-static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
unsigned long *token, bool *workingset)
{
return false;
@@ -428,17 +427,16 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
struct pglist_data *pgdat;
unsigned long eviction;
- rcu_read_lock();
-
if (lru_gen_enabled()) {
- bool recent = lru_gen_test_recent(shadow, file,
- &eviction_lruvec, &eviction, workingset);
+ bool recent;
+ rcu_read_lock();
+ recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset);
rcu_read_unlock();
return recent;
}
-
+ rcu_read_lock();
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
eviction <<= bucket_order;
@@ -459,14 +457,12 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
* configurations instead.
*/
eviction_memcg = mem_cgroup_from_id(memcgid);
- if (!mem_cgroup_disabled() &&
- (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) {
- rcu_read_unlock();
- return false;
- }
-
+ if (!mem_cgroup_tryget(eviction_memcg))
+ eviction_memcg = NULL;
rcu_read_unlock();
+ if (!mem_cgroup_disabled() && !eviction_memcg)
+ return false;
/*
* Flush stats (and potentially sleep) outside the RCU read section.
*
@@ -544,6 +540,8 @@ void workingset_refault(struct folio *folio, void *shadow)
bool workingset;
long nr;
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
if (lru_gen_enabled()) {
lru_gen_refault(folio, shadow);
return;
@@ -558,7 +556,6 @@ void workingset_refault(struct folio *folio, void *shadow)
* is actually experiencing the refault event. Make sure the folio is
* locked to guarantee folio_memcg() stability throughout.
*/
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
nr = folio_nr_pages(folio);
memcg = folio_memcg(folio);
pgdat = folio_pgdat(folio);
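On the workingset.c side, lru_gen_test_recent() drops the file/anon argument: assuming the shadow token's upper bits carry the generation sequence recorded at eviction, a refault is treated as recent when max_seq has advanced by fewer than MAX_NR_GENS since then. A standalone sketch of that comparison, using assumed widths rather than the kernel's EVICTION_MASK handling:

#include <stdbool.h>
#include <stdio.h>

/* Assumed values for illustration; the kernel derives its own. */
#define MAX_NR_GENS	4
#define SEQ_MASK	((1UL << 16) - 1)	/* stand-in for EVICTION_MASK >> LRU_REFS_WIDTH */

static unsigned long abs_diff_ul(unsigned long a, unsigned long b)
{
	return a > b ? a - b : b - a;
}

/* Model of the reworked recency check in lru_gen_test_recent(). */
static bool recent_sketch(unsigned long evicted_seq, unsigned long cur_max_seq)
{
	evicted_seq &= SEQ_MASK;
	cur_max_seq &= SEQ_MASK;

	/* recent if fewer than MAX_NR_GENS aging passes happened since eviction */
	return abs_diff_ul(cur_max_seq, evicted_seq) < MAX_NR_GENS;
}

int main(void)
{
	printf("%d %d\n", recent_sketch(100, 102), recent_sketch(100, 110));
	return 0;
}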
diff --git a/mm/zpdesc.h b/mm/zpdesc.h
new file mode 100644
index 000000000000..fa47fece2237
--- /dev/null
+++ b/mm/zpdesc.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* zpdesc.h: zswap.zpool memory descriptor
+ *
+ * Written by Alex Shi <alexs@kernel.org>
+ * Hyeonggon Yoo <42.hyeyoo@gmail.com>
+ */
+#ifndef __MM_ZPDESC_H__
+#define __MM_ZPDESC_H__
+
+/*
+ * struct zpdesc - Memory descriptor for zpool memory.
+ * @flags: Page flags, mostly unused by zsmalloc.
+ * @lru: Indirectly used by page migration.
+ * @movable_ops: Used by page migration.
+ * @next: Next zpdesc in a zspage in zsmalloc zpool.
+ * @handle: For huge zspage in zsmalloc zpool.
+ * @zspage: Points to the zspage this zpdesc is a part of.
+ * @first_obj_offset: First object offset in zsmalloc zpool.
+ * @_refcount: The number of references to this zpdesc.
+ *
+ * This struct overlays struct page for now. Do not modify without a good
+ * understanding of the issues. In particular, do not expand into the overlap
+ * with memcg_data.
+ *
+ * Page flags used:
+ * * PG_private identifies the first component page.
+ * * PG_locked is used by page migration code.
+ */
+struct zpdesc {
+ unsigned long flags;
+ struct list_head lru;
+ unsigned long movable_ops;
+ union {
+ struct zpdesc *next;
+ unsigned long handle;
+ };
+ struct zspage *zspage;
+ /*
+ * Only the lower 24 bits are available for offset, limiting a page
+ * to 16 MiB. The upper 8 bits are reserved for PGTY_zsmalloc.
+ *
+ * Do not access this field directly.
+ * Instead, use {get,set}_first_obj_offset() helpers.
+ */
+ unsigned int first_obj_offset;
+ atomic_t _refcount;
+};
+#define ZPDESC_MATCH(pg, zp) \
+ static_assert(offsetof(struct page, pg) == offsetof(struct zpdesc, zp))
+
+ZPDESC_MATCH(flags, flags);
+ZPDESC_MATCH(lru, lru);
+ZPDESC_MATCH(mapping, movable_ops);
+ZPDESC_MATCH(index, next);
+ZPDESC_MATCH(index, handle);
+ZPDESC_MATCH(private, zspage);
+ZPDESC_MATCH(page_type, first_obj_offset);
+ZPDESC_MATCH(_refcount, _refcount);
+#undef ZPDESC_MATCH
+static_assert(sizeof(struct zpdesc) <= sizeof(struct page));
+
+/*
+ * zpdesc_page - The first struct page allocated for a zpdesc
+ * @zp: The zpdesc.
+ *
+ * A convenience wrapper for converting zpdesc to the first struct page of the
+ * underlying folio, to communicate with code not yet converted to folio or
+ * struct zpdesc.
+ *
+ */
+#define zpdesc_page(zp) (_Generic((zp), \
+ const struct zpdesc *: (const struct page *)(zp), \
+ struct zpdesc *: (struct page *)(zp)))
+
+/**
+ * zpdesc_folio - The folio allocated for a zpdesc
+ * @zp: The zpdesc.
+ *
+ * Zpdescs are descriptors for zpool memory. The zpool memory itself is
+ * allocated as folios that contain the zpool objects, and zpdesc uses specific
+ * fields in the first struct page of the folio - those fields are now accessed
+ * by struct zpdesc.
+ *
+ * It is occasionally necessary to convert back to a folio in order to
+ * communicate with the rest of the mm. Please use this helper function
+ * instead of casting yourself, as the implementation may change in the future.
+ */
+#define zpdesc_folio(zp) (_Generic((zp), \
+ const struct zpdesc *: (const struct folio *)(zp), \
+ struct zpdesc *: (struct folio *)(zp)))
+/**
+ * page_zpdesc - Converts from first struct page to zpdesc.
+ * @p: The first (either head of compound or single) page of zpdesc.
+ *
+ * A temporary wrapper to convert struct page to struct zpdesc in situations
+ * where we know the page is the compound head, or single order-0 page.
+ *
+ * Long-term ideally everything would work with struct zpdesc directly or go
+ * through folio to struct zpdesc.
+ *
+ * Return: The zpdesc which contains this page
+ */
+#define page_zpdesc(p) (_Generic((p), \
+ const struct page *: (const struct zpdesc *)(p), \
+ struct page *: (struct zpdesc *)(p)))
+
+static inline void zpdesc_lock(struct zpdesc *zpdesc)
+{
+ folio_lock(zpdesc_folio(zpdesc));
+}
+
+static inline bool zpdesc_trylock(struct zpdesc *zpdesc)
+{
+ return folio_trylock(zpdesc_folio(zpdesc));
+}
+
+static inline void zpdesc_unlock(struct zpdesc *zpdesc)
+{
+ folio_unlock(zpdesc_folio(zpdesc));
+}
+
+static inline void zpdesc_wait_locked(struct zpdesc *zpdesc)
+{
+ folio_wait_locked(zpdesc_folio(zpdesc));
+}
+
+static inline void zpdesc_get(struct zpdesc *zpdesc)
+{
+ folio_get(zpdesc_folio(zpdesc));
+}
+
+static inline void zpdesc_put(struct zpdesc *zpdesc)
+{
+ folio_put(zpdesc_folio(zpdesc));
+}
+
+static inline void *kmap_local_zpdesc(struct zpdesc *zpdesc)
+{
+ return kmap_local_page(zpdesc_page(zpdesc));
+}
+
+static inline unsigned long zpdesc_pfn(struct zpdesc *zpdesc)
+{
+ return page_to_pfn(zpdesc_page(zpdesc));
+}
+
+static inline struct zpdesc *pfn_zpdesc(unsigned long pfn)
+{
+ return page_zpdesc(pfn_to_page(pfn));
+}
+
+static inline void __zpdesc_set_movable(struct zpdesc *zpdesc,
+ const struct movable_operations *mops)
+{
+ __SetPageMovable(zpdesc_page(zpdesc), mops);
+}
+
+static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc)
+{
+ __SetPageZsmalloc(zpdesc_page(zpdesc));
+}
+
+static inline void __zpdesc_clear_zsmalloc(struct zpdesc *zpdesc)
+{
+ __ClearPageZsmalloc(zpdesc_page(zpdesc));
+}
+
+static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc)
+{
+ return PageIsolated(zpdesc_page(zpdesc));
+}
+
+static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc)
+{
+ return page_zone(zpdesc_page(zpdesc));
+}
+
+static inline bool zpdesc_is_locked(struct zpdesc *zpdesc)
+{
+ return folio_test_locked(zpdesc_folio(zpdesc));
+}
+#endif
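The new header above is self-contained, so a short hypothetical caller is enough to show how the wrappers compose. Everything used below is defined in zpdesc.h above; the example function itself is illustration only, not part of the patch, and assumes normal kernel context (pr_debug() and friends):

/* Hypothetical usage: lock a zpdesc obtained from a struct page. */
static void zpdesc_usage_example(struct page *page)
{
	struct zpdesc *zpdesc = page_zpdesc(page);

	zpdesc_lock(zpdesc);			/* folio_lock() underneath */
	pr_debug("zpdesc pfn=%lu isolated=%d\n",
		 zpdesc_pfn(zpdesc), zpdesc_is_isolated(zpdesc));
	zpdesc_unlock(zpdesc);			/* folio_unlock() underneath */
}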
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 64b66a4d3e6e..6d0e47f7ae33 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -13,24 +13,6 @@
* Released under the terms of GNU General Public License Version 2.0
*/
-/*
- * Following is how we use various fields and flags of underlying
- * struct page(s) to form a zspage.
- *
- * Usage of struct page fields:
- * page->private: points to zspage
- * page->index: links together all component pages of a zspage
- * For the huge page, this is always 0, so we use this field
- * to store handle.
- * page->page_type: PGTY_zsmalloc, lower 24 bits locate the first object
- * offset in a subpage of a zspage
- *
- * Usage of struct page flags:
- * PG_private: identifies the first component page
- * PG_owner_priv_1: identifies the huge component page
- *
- */
-
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
/*
@@ -67,6 +49,7 @@
#include <linux/pagemap.h>
#include <linux/fs.h>
#include <linux/local_lock.h>
+#include "zpdesc.h"
#define ZSPAGE_MAGIC 0x58
@@ -245,6 +228,35 @@ struct zs_pool {
atomic_t compaction_in_progress;
};
+static inline void zpdesc_set_first(struct zpdesc *zpdesc)
+{
+ SetPagePrivate(zpdesc_page(zpdesc));
+}
+
+static inline void zpdesc_inc_zone_page_state(struct zpdesc *zpdesc)
+{
+ inc_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
+}
+
+static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc)
+{
+ dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
+}
+
+static inline struct zpdesc *alloc_zpdesc(gfp_t gfp)
+{
+ struct page *page = alloc_page(gfp);
+
+ return page_zpdesc(page);
+}
+
+static inline void free_zpdesc(struct zpdesc *zpdesc)
+{
+ struct page *page = zpdesc_page(zpdesc);
+
+ __free_page(page);
+}
+
struct zspage {
struct {
unsigned int huge:HUGE_BITS;
@@ -254,7 +266,7 @@ struct zspage {
};
unsigned int inuse;
unsigned int freeobj;
- struct page *first_page;
+ struct zpdesc *first_zpdesc;
struct list_head list; /* fullness list */
struct zs_pool *pool;
rwlock_t lock;
@@ -440,9 +452,9 @@ static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = {
.lock = INIT_LOCAL_LOCK(lock),
};
-static __maybe_unused int is_first_page(struct page *page)
+static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc)
{
- return PagePrivate(page);
+ return PagePrivate(zpdesc_page(zpdesc));
}
/* Protected by class->lock */
@@ -451,36 +463,35 @@ static inline int get_zspage_inuse(struct zspage *zspage)
return zspage->inuse;
}
-
static inline void mod_zspage_inuse(struct zspage *zspage, int val)
{
zspage->inuse += val;
}
-static inline struct page *get_first_page(struct zspage *zspage)
+static struct zpdesc *get_first_zpdesc(struct zspage *zspage)
{
- struct page *first_page = zspage->first_page;
+ struct zpdesc *first_zpdesc = zspage->first_zpdesc;
- VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
- return first_page;
+ VM_BUG_ON_PAGE(!is_first_zpdesc(first_zpdesc), zpdesc_page(first_zpdesc));
+ return first_zpdesc;
}
#define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff
-static inline unsigned int get_first_obj_offset(struct page *page)
+static inline unsigned int get_first_obj_offset(struct zpdesc *zpdesc)
{
- VM_WARN_ON_ONCE(!PageZsmalloc(page));
- return page->page_type & FIRST_OBJ_PAGE_TYPE_MASK;
+ VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc)));
+ return zpdesc->first_obj_offset & FIRST_OBJ_PAGE_TYPE_MASK;
}
-static inline void set_first_obj_offset(struct page *page, unsigned int offset)
+static inline void set_first_obj_offset(struct zpdesc *zpdesc, unsigned int offset)
{
/* With 24 bits available, we can support offsets into 16 MiB pages. */
BUILD_BUG_ON(PAGE_SIZE > SZ_16M);
- VM_WARN_ON_ONCE(!PageZsmalloc(page));
+ VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc)));
VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK);
- page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK;
- page->page_type |= offset & FIRST_OBJ_PAGE_TYPE_MASK;
+ zpdesc->first_obj_offset &= ~FIRST_OBJ_PAGE_TYPE_MASK;
+ zpdesc->first_obj_offset |= offset & FIRST_OBJ_PAGE_TYPE_MASK;
}
static inline unsigned int get_freeobj(struct zspage *zspage)
@@ -733,52 +744,52 @@ out:
return newfg;
}
-static struct zspage *get_zspage(struct page *page)
+static struct zspage *get_zspage(struct zpdesc *zpdesc)
{
- struct zspage *zspage = (struct zspage *)page_private(page);
+ struct zspage *zspage = zpdesc->zspage;
BUG_ON(zspage->magic != ZSPAGE_MAGIC);
return zspage;
}
-static struct page *get_next_page(struct page *page)
+static struct zpdesc *get_next_zpdesc(struct zpdesc *zpdesc)
{
- struct zspage *zspage = get_zspage(page);
+ struct zspage *zspage = get_zspage(zpdesc);
if (unlikely(ZsHugePage(zspage)))
return NULL;
- return (struct page *)page->index;
+ return zpdesc->next;
}
/**
- * obj_to_location - get (<page>, <obj_idx>) from encoded object value
+ * obj_to_location - get (<zpdesc>, <obj_idx>) from encoded object value
* @obj: the encoded object value
- * @page: page object resides in zspage
+ * @zpdesc: zpdesc object resides in zspage
* @obj_idx: object index
*/
-static void obj_to_location(unsigned long obj, struct page **page,
+static void obj_to_location(unsigned long obj, struct zpdesc **zpdesc,
unsigned int *obj_idx)
{
- *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS);
*obj_idx = (obj & OBJ_INDEX_MASK);
}
-static void obj_to_page(unsigned long obj, struct page **page)
+static void obj_to_zpdesc(unsigned long obj, struct zpdesc **zpdesc)
{
- *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS);
}
/**
- * location_to_obj - get obj value encoded from (<page>, <obj_idx>)
- * @page: page object resides in zspage
+ * location_to_obj - get obj value encoded from (<zpdesc>, <obj_idx>)
+ * @zpdesc: zpdesc object resides in zspage
* @obj_idx: object index
*/
-static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
+static unsigned long location_to_obj(struct zpdesc *zpdesc, unsigned int obj_idx)
{
unsigned long obj;
- obj = page_to_pfn(page) << OBJ_INDEX_BITS;
+ obj = zpdesc_pfn(zpdesc) << OBJ_INDEX_BITS;
obj |= obj_idx & OBJ_INDEX_MASK;
return obj;
@@ -789,15 +800,15 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
-static inline bool obj_allocated(struct page *page, void *obj,
+static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj,
unsigned long *phandle)
{
unsigned long handle;
- struct zspage *zspage = get_zspage(page);
+ struct zspage *zspage = get_zspage(zpdesc);
if (unlikely(ZsHugePage(zspage))) {
- VM_BUG_ON_PAGE(!is_first_page(page), page);
- handle = page->index;
+ VM_BUG_ON_PAGE(!is_first_zpdesc(zpdesc), zpdesc_page(zpdesc));
+ handle = zpdesc->handle;
} else
handle = *(unsigned long *)obj;
@@ -809,22 +820,24 @@ static inline bool obj_allocated(struct page *page, void *obj,
return true;
}
-static void reset_page(struct page *page)
+static void reset_zpdesc(struct zpdesc *zpdesc)
{
+ struct page *page = zpdesc_page(zpdesc);
+
__ClearPageMovable(page);
ClearPagePrivate(page);
- set_page_private(page, 0);
- page->index = 0;
+ zpdesc->zspage = NULL;
+ zpdesc->next = NULL;
__ClearPageZsmalloc(page);
}
static int trylock_zspage(struct zspage *zspage)
{
- struct page *cursor, *fail;
+ struct zpdesc *cursor, *fail;
- for (cursor = get_first_page(zspage); cursor != NULL; cursor =
- get_next_page(cursor)) {
- if (!trylock_page(cursor)) {
+ for (cursor = get_first_zpdesc(zspage); cursor != NULL; cursor =
+ get_next_zpdesc(cursor)) {
+ if (!zpdesc_trylock(cursor)) {
fail = cursor;
goto unlock;
}
@@ -832,9 +845,9 @@ static int trylock_zspage(struct zspage *zspage)
return 1;
unlock:
- for (cursor = get_first_page(zspage); cursor != fail; cursor =
- get_next_page(cursor))
- unlock_page(cursor);
+ for (cursor = get_first_zpdesc(zspage); cursor != fail; cursor =
+ get_next_zpdesc(cursor))
+ zpdesc_unlock(cursor);
return 0;
}
@@ -842,23 +855,23 @@ unlock:
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
- struct page *page, *next;
+ struct zpdesc *zpdesc, *next;
assert_spin_locked(&class->lock);
VM_BUG_ON(get_zspage_inuse(zspage));
VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0);
- next = page = get_first_page(zspage);
+ next = zpdesc = get_first_zpdesc(zspage);
do {
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- next = get_next_page(page);
- reset_page(page);
- unlock_page(page);
- dec_zone_page_state(page, NR_ZSPAGES);
- put_page(page);
- page = next;
- } while (page != NULL);
+ VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc));
+ next = get_next_zpdesc(zpdesc);
+ reset_zpdesc(zpdesc);
+ zpdesc_unlock(zpdesc);
+ zpdesc_dec_zone_page_state(zpdesc);
+ zpdesc_put(zpdesc);
+ zpdesc = next;
+ } while (zpdesc != NULL);
cache_free_zspage(pool, zspage);
@@ -891,16 +904,16 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
{
unsigned int freeobj = 1;
unsigned long off = 0;
- struct page *page = get_first_page(zspage);
+ struct zpdesc *zpdesc = get_first_zpdesc(zspage);
- while (page) {
- struct page *next_page;
+ while (zpdesc) {
+ struct zpdesc *next_zpdesc;
struct link_free *link;
void *vaddr;
- set_first_obj_offset(page, off);
+ set_first_obj_offset(zpdesc, off);
- vaddr = kmap_local_page(page);
+ vaddr = kmap_local_zpdesc(zpdesc);
link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
@@ -913,8 +926,8 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
* page, which must point to the first object on the next
* page (if present)
*/
- next_page = get_next_page(page);
- if (next_page) {
+ next_zpdesc = get_next_zpdesc(zpdesc);
+ if (next_zpdesc) {
link->next = freeobj++ << OBJ_TAG_BITS;
} else {
/*
@@ -924,7 +937,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
link->next = -1UL << OBJ_TAG_BITS;
}
kunmap_local(vaddr);
- page = next_page;
+ zpdesc = next_zpdesc;
off %= PAGE_SIZE;
}
@@ -932,35 +945,35 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
}
static void create_page_chain(struct size_class *class, struct zspage *zspage,
- struct page *pages[])
+ struct zpdesc *zpdescs[])
{
int i;
- struct page *page;
- struct page *prev_page = NULL;
- int nr_pages = class->pages_per_zspage;
+ struct zpdesc *zpdesc;
+ struct zpdesc *prev_zpdesc = NULL;
+ int nr_zpdescs = class->pages_per_zspage;
/*
* Allocate individual pages and link them together as:
- * 1. all pages are linked together using page->index
- * 2. each sub-page point to zspage using page->private
+ * 1. all pages are linked together using zpdesc->next
+	 * 2. each sub-page points to zspage using zpdesc->zspage
*
- * we set PG_private to identify the first page (i.e. no other sub-page
+ * we set PG_private to identify the first zpdesc (i.e. no other zpdesc
* has this flag set).
*/
- for (i = 0; i < nr_pages; i++) {
- page = pages[i];
- set_page_private(page, (unsigned long)zspage);
- page->index = 0;
+ for (i = 0; i < nr_zpdescs; i++) {
+ zpdesc = zpdescs[i];
+ zpdesc->zspage = zspage;
+ zpdesc->next = NULL;
if (i == 0) {
- zspage->first_page = page;
- SetPagePrivate(page);
+ zspage->first_zpdesc = zpdesc;
+ zpdesc_set_first(zpdesc);
if (unlikely(class->objs_per_zspage == 1 &&
class->pages_per_zspage == 1))
SetZsHugePage(zspage);
} else {
- prev_page->index = (unsigned long)page;
+ prev_zpdesc->next = zpdesc;
}
- prev_page = page;
+ prev_zpdesc = zpdesc;
}
}
@@ -972,7 +985,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
gfp_t gfp)
{
int i;
- struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE];
+ struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
struct zspage *zspage = cache_alloc_zspage(pool, gfp);
if (!zspage)
@@ -982,25 +995,25 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
migrate_lock_init(zspage);
for (i = 0; i < class->pages_per_zspage; i++) {
- struct page *page;
+ struct zpdesc *zpdesc;
- page = alloc_page(gfp);
- if (!page) {
+ zpdesc = alloc_zpdesc(gfp);
+ if (!zpdesc) {
while (--i >= 0) {
- dec_zone_page_state(pages[i], NR_ZSPAGES);
- __ClearPageZsmalloc(pages[i]);
- __free_page(pages[i]);
+ zpdesc_dec_zone_page_state(zpdescs[i]);
+ __zpdesc_clear_zsmalloc(zpdescs[i]);
+ free_zpdesc(zpdescs[i]);
}
cache_free_zspage(pool, zspage);
return NULL;
}
- __SetPageZsmalloc(page);
+ __zpdesc_set_zsmalloc(zpdesc);
- inc_zone_page_state(page, NR_ZSPAGES);
- pages[i] = page;
+ zpdesc_inc_zone_page_state(zpdesc);
+ zpdescs[i] = zpdesc;
}
- create_page_chain(class, zspage, pages);
+ create_page_chain(class, zspage, zpdescs);
init_zspage(class, zspage);
zspage->pool = pool;
zspage->class = class->index;
@@ -1044,7 +1057,7 @@ static inline void __zs_cpu_down(struct mapping_area *area)
}
static void *__zs_map_object(struct mapping_area *area,
- struct page *pages[2], int off, int size)
+ struct zpdesc *zpdescs[2], int off, int size)
{
size_t sizes[2];
char *buf = area->vm_buf;
@@ -1060,14 +1073,14 @@ static void *__zs_map_object(struct mapping_area *area,
sizes[1] = size - sizes[0];
/* copy object to per-cpu buffer */
- memcpy_from_page(buf, pages[0], off, sizes[0]);
- memcpy_from_page(buf + sizes[0], pages[1], 0, sizes[1]);
+ memcpy_from_page(buf, zpdesc_page(zpdescs[0]), off, sizes[0]);
+ memcpy_from_page(buf + sizes[0], zpdesc_page(zpdescs[1]), 0, sizes[1]);
out:
return area->vm_buf;
}
static void __zs_unmap_object(struct mapping_area *area,
- struct page *pages[2], int off, int size)
+ struct zpdesc *zpdescs[2], int off, int size)
{
size_t sizes[2];
char *buf;
@@ -1085,8 +1098,8 @@ static void __zs_unmap_object(struct mapping_area *area,
sizes[1] = size - sizes[0];
/* copy per-cpu buffer to object */
- memcpy_to_page(pages[0], off, buf, sizes[0]);
- memcpy_to_page(pages[1], 0, buf + sizes[0], sizes[1]);
+ memcpy_to_page(zpdesc_page(zpdescs[0]), off, buf, sizes[0]);
+ memcpy_to_page(zpdesc_page(zpdescs[1]), 0, buf + sizes[0], sizes[1]);
out:
/* enable page faults to match kunmap_local() return conditions */
@@ -1176,13 +1189,13 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
enum zs_mapmode mm)
{
struct zspage *zspage;
- struct page *page;
+ struct zpdesc *zpdesc;
unsigned long obj, off;
unsigned int obj_idx;
struct size_class *class;
struct mapping_area *area;
- struct page *pages[2];
+ struct zpdesc *zpdescs[2];
void *ret;
/*
@@ -1195,8 +1208,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
/* It guarantees it can get zspage from handle safely */
read_lock(&pool->migrate_lock);
obj = handle_to_obj(handle);
- obj_to_location(obj, &page, &obj_idx);
- zspage = get_zspage(page);
+ obj_to_location(obj, &zpdesc, &obj_idx);
+ zspage = get_zspage(zpdesc);
/*
* migration cannot move any zpages in this zspage. Here, class->lock
@@ -1215,17 +1228,17 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
area->vm_mm = mm;
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
- area->vm_addr = kmap_local_page(page);
+ area->vm_addr = kmap_local_zpdesc(zpdesc);
ret = area->vm_addr + off;
goto out;
}
/* this object spans two pages */
- pages[0] = page;
- pages[1] = get_next_page(page);
- BUG_ON(!pages[1]);
+ zpdescs[0] = zpdesc;
+ zpdescs[1] = get_next_zpdesc(zpdesc);
+ BUG_ON(!zpdescs[1]);
- ret = __zs_map_object(area, pages, off, class->size);
+ ret = __zs_map_object(area, zpdescs, off, class->size);
out:
if (likely(!ZsHugePage(zspage)))
ret += ZS_HANDLE_SIZE;
@@ -1237,7 +1250,7 @@ EXPORT_SYMBOL_GPL(zs_map_object);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
struct zspage *zspage;
- struct page *page;
+ struct zpdesc *zpdesc;
unsigned long obj, off;
unsigned int obj_idx;
@@ -1245,8 +1258,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
struct mapping_area *area;
obj = handle_to_obj(handle);
- obj_to_location(obj, &page, &obj_idx);
- zspage = get_zspage(page);
+ obj_to_location(obj, &zpdesc, &obj_idx);
+ zspage = get_zspage(zpdesc);
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
@@ -1254,13 +1267,13 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
if (off + class->size <= PAGE_SIZE)
kunmap_local(area->vm_addr);
else {
- struct page *pages[2];
+ struct zpdesc *zpdescs[2];
- pages[0] = page;
- pages[1] = get_next_page(page);
- BUG_ON(!pages[1]);
+ zpdescs[0] = zpdesc;
+ zpdescs[1] = get_next_zpdesc(zpdesc);
+ BUG_ON(!zpdescs[1]);
- __zs_unmap_object(area, pages, off, class->size);
+ __zs_unmap_object(area, zpdescs, off, class->size);
}
local_unlock(&zs_map_area.lock);
@@ -1290,12 +1303,12 @@ EXPORT_SYMBOL_GPL(zs_huge_class_size);
static unsigned long obj_malloc(struct zs_pool *pool,
struct zspage *zspage, unsigned long handle)
{
- int i, nr_page, offset;
+ int i, nr_zpdesc, offset;
unsigned long obj;
struct link_free *link;
struct size_class *class;
- struct page *m_page;
+ struct zpdesc *m_zpdesc;
unsigned long m_offset;
void *vaddr;
@@ -1303,27 +1316,26 @@ static unsigned long obj_malloc(struct zs_pool *pool,
obj = get_freeobj(zspage);
offset = obj * class->size;
- nr_page = offset >> PAGE_SHIFT;
+ nr_zpdesc = offset >> PAGE_SHIFT;
m_offset = offset_in_page(offset);
- m_page = get_first_page(zspage);
+ m_zpdesc = get_first_zpdesc(zspage);
- for (i = 0; i < nr_page; i++)
- m_page = get_next_page(m_page);
+ for (i = 0; i < nr_zpdesc; i++)
+ m_zpdesc = get_next_zpdesc(m_zpdesc);
- vaddr = kmap_local_page(m_page);
+ vaddr = kmap_local_zpdesc(m_zpdesc);
link = (struct link_free *)vaddr + m_offset / sizeof(*link);
set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
if (likely(!ZsHugePage(zspage)))
/* record handle in the header of allocated chunk */
link->handle = handle | OBJ_ALLOCATED_TAG;
else
- /* record handle to page->index */
- zspage->first_page->index = handle | OBJ_ALLOCATED_TAG;
+ zspage->first_zpdesc->handle = handle | OBJ_ALLOCATED_TAG;
kunmap_local(vaddr);
mod_zspage_inuse(zspage, 1);
- obj = location_to_obj(m_page, obj);
+ obj = location_to_obj(m_zpdesc, obj);
record_obj(handle, obj);
return obj;
@@ -1402,23 +1414,24 @@ static void obj_free(int class_size, unsigned long obj)
{
struct link_free *link;
struct zspage *zspage;
- struct page *f_page;
+ struct zpdesc *f_zpdesc;
unsigned long f_offset;
unsigned int f_objidx;
void *vaddr;
- obj_to_location(obj, &f_page, &f_objidx);
+
+ obj_to_location(obj, &f_zpdesc, &f_objidx);
f_offset = offset_in_page(class_size * f_objidx);
- zspage = get_zspage(f_page);
+ zspage = get_zspage(f_zpdesc);
- vaddr = kmap_local_page(f_page);
+ vaddr = kmap_local_zpdesc(f_zpdesc);
link = (struct link_free *)(vaddr + f_offset);
/* Insert this object in containing zspage's freelist */
if (likely(!ZsHugePage(zspage)))
link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
else
- f_page->index = 0;
+ f_zpdesc->handle = 0;
set_freeobj(zspage, f_objidx);
kunmap_local(vaddr);
@@ -1428,7 +1441,7 @@ static void obj_free(int class_size, unsigned long obj)
void zs_free(struct zs_pool *pool, unsigned long handle)
{
struct zspage *zspage;
- struct page *f_page;
+ struct zpdesc *f_zpdesc;
unsigned long obj;
struct size_class *class;
int fullness;
@@ -1442,8 +1455,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
*/
read_lock(&pool->migrate_lock);
obj = handle_to_obj(handle);
- obj_to_page(obj, &f_page);
- zspage = get_zspage(f_page);
+ obj_to_zpdesc(obj, &f_zpdesc);
+ zspage = get_zspage(f_zpdesc);
class = zspage_class(pool, zspage);
spin_lock(&class->lock);
read_unlock(&pool->migrate_lock);
@@ -1463,7 +1476,7 @@ EXPORT_SYMBOL_GPL(zs_free);
static void zs_object_copy(struct size_class *class, unsigned long dst,
unsigned long src)
{
- struct page *s_page, *d_page;
+ struct zpdesc *s_zpdesc, *d_zpdesc;
unsigned int s_objidx, d_objidx;
unsigned long s_off, d_off;
void *s_addr, *d_addr;
@@ -1472,8 +1485,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
s_size = d_size = class->size;
- obj_to_location(src, &s_page, &s_objidx);
- obj_to_location(dst, &d_page, &d_objidx);
+ obj_to_location(src, &s_zpdesc, &s_objidx);
+ obj_to_location(dst, &d_zpdesc, &d_objidx);
s_off = offset_in_page(class->size * s_objidx);
d_off = offset_in_page(class->size * d_objidx);
@@ -1484,8 +1497,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
if (d_off + class->size > PAGE_SIZE)
d_size = PAGE_SIZE - d_off;
- s_addr = kmap_local_page(s_page);
- d_addr = kmap_local_page(d_page);
+ s_addr = kmap_local_zpdesc(s_zpdesc);
+ d_addr = kmap_local_zpdesc(d_zpdesc);
while (1) {
size = min(s_size, d_size);
@@ -1510,17 +1523,17 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
if (s_off >= PAGE_SIZE) {
kunmap_local(d_addr);
kunmap_local(s_addr);
- s_page = get_next_page(s_page);
- s_addr = kmap_local_page(s_page);
- d_addr = kmap_local_page(d_page);
+ s_zpdesc = get_next_zpdesc(s_zpdesc);
+ s_addr = kmap_local_zpdesc(s_zpdesc);
+ d_addr = kmap_local_zpdesc(d_zpdesc);
s_size = class->size - written;
s_off = 0;
}
if (d_off >= PAGE_SIZE) {
kunmap_local(d_addr);
- d_page = get_next_page(d_page);
- d_addr = kmap_local_page(d_page);
+ d_zpdesc = get_next_zpdesc(d_zpdesc);
+ d_addr = kmap_local_zpdesc(d_zpdesc);
d_size = class->size - written;
d_off = 0;
}
@@ -1535,18 +1548,18 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
* return handle.
*/
static unsigned long find_alloced_obj(struct size_class *class,
- struct page *page, int *obj_idx)
+ struct zpdesc *zpdesc, int *obj_idx)
{
unsigned int offset;
int index = *obj_idx;
unsigned long handle = 0;
- void *addr = kmap_local_page(page);
+ void *addr = kmap_local_zpdesc(zpdesc);
- offset = get_first_obj_offset(page);
+ offset = get_first_obj_offset(zpdesc);
offset += class->size * index;
while (offset < PAGE_SIZE) {
- if (obj_allocated(page, addr + offset, &handle))
+ if (obj_allocated(zpdesc, addr + offset, &handle))
break;
offset += class->size;
@@ -1566,14 +1579,14 @@ static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage,
unsigned long used_obj, free_obj;
unsigned long handle;
int obj_idx = 0;
- struct page *s_page = get_first_page(src_zspage);
+ struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage);
struct size_class *class = pool->size_class[src_zspage->class];
while (1) {
- handle = find_alloced_obj(class, s_page, &obj_idx);
+ handle = find_alloced_obj(class, s_zpdesc, &obj_idx);
if (!handle) {
- s_page = get_next_page(s_page);
- if (!s_page)
+ s_zpdesc = get_next_zpdesc(s_zpdesc);
+ if (!s_zpdesc)
break;
obj_idx = 0;
continue;
@@ -1653,7 +1666,7 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage)
*/
static void lock_zspage(struct zspage *zspage)
{
- struct page *curr_page, *page;
+ struct zpdesc *curr_zpdesc, *zpdesc;
/*
* Pages we haven't locked yet can be migrated off the list while we're
@@ -1665,24 +1678,24 @@ static void lock_zspage(struct zspage *zspage)
*/
while (1) {
migrate_read_lock(zspage);
- page = get_first_page(zspage);
- if (trylock_page(page))
+ zpdesc = get_first_zpdesc(zspage);
+ if (zpdesc_trylock(zpdesc))
break;
- get_page(page);
+ zpdesc_get(zpdesc);
migrate_read_unlock(zspage);
- wait_on_page_locked(page);
- put_page(page);
+ zpdesc_wait_locked(zpdesc);
+ zpdesc_put(zpdesc);
}
- curr_page = page;
- while ((page = get_next_page(curr_page))) {
- if (trylock_page(page)) {
- curr_page = page;
+ curr_zpdesc = zpdesc;
+ while ((zpdesc = get_next_zpdesc(curr_zpdesc))) {
+ if (zpdesc_trylock(zpdesc)) {
+ curr_zpdesc = zpdesc;
} else {
- get_page(page);
+ zpdesc_get(zpdesc);
migrate_read_unlock(zspage);
- wait_on_page_locked(page);
- put_page(page);
+ zpdesc_wait_locked(zpdesc);
+ zpdesc_put(zpdesc);
migrate_read_lock(zspage);
}
}
@@ -1720,26 +1733,28 @@ static void migrate_write_unlock(struct zspage *zspage)
static const struct movable_operations zsmalloc_mops;
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
- struct page *newpage, struct page *oldpage)
+ struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc)
{
- struct page *page;
- struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
+ struct zpdesc *zpdesc;
+ struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
+ unsigned int first_obj_offset;
int idx = 0;
- page = get_first_page(zspage);
+ zpdesc = get_first_zpdesc(zspage);
do {
- if (page == oldpage)
- pages[idx] = newpage;
+ if (zpdesc == oldzpdesc)
+ zpdescs[idx] = newzpdesc;
else
- pages[idx] = page;
+ zpdescs[idx] = zpdesc;
idx++;
- } while ((page = get_next_page(page)) != NULL);
+ } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL);
- create_page_chain(class, zspage, pages);
- set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
+ create_page_chain(class, zspage, zpdescs);
+ first_obj_offset = get_first_obj_offset(oldzpdesc);
+ set_first_obj_offset(newzpdesc, first_obj_offset);
if (unlikely(ZsHugePage(zspage)))
- newpage->index = oldpage->index;
- __SetPageMovable(newpage, &zsmalloc_mops);
+ newzpdesc->handle = oldzpdesc->handle;
+ __zpdesc_set_movable(newzpdesc, &zsmalloc_mops);
}
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
@@ -1759,20 +1774,22 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
struct zs_pool *pool;
struct size_class *class;
struct zspage *zspage;
- struct page *dummy;
+ struct zpdesc *dummy;
+ struct zpdesc *newzpdesc = page_zpdesc(newpage);
+ struct zpdesc *zpdesc = page_zpdesc(page);
void *s_addr, *d_addr, *addr;
unsigned int offset;
unsigned long handle;
unsigned long old_obj, new_obj;
unsigned int obj_idx;
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc));
/* We're committed, tell the world that this is a Zsmalloc page. */
- __SetPageZsmalloc(newpage);
+ __zpdesc_set_zsmalloc(newzpdesc);
/* The page is locked, so this pointer must remain valid */
- zspage = get_zspage(page);
+ zspage = get_zspage(zpdesc);
pool = zspage->pool;
/*
@@ -1789,30 +1806,29 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
/* the migrate_write_lock protects zpage access via zs_map_object */
migrate_write_lock(zspage);
- offset = get_first_obj_offset(page);
- s_addr = kmap_local_page(page);
+ offset = get_first_obj_offset(zpdesc);
+ s_addr = kmap_local_zpdesc(zpdesc);
/*
* Here, any user cannot access all objects in the zspage so let's move.
*/
- d_addr = kmap_local_page(newpage);
+ d_addr = kmap_local_zpdesc(newzpdesc);
copy_page(d_addr, s_addr);
kunmap_local(d_addr);
for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE;
addr += class->size) {
- if (obj_allocated(page, addr, &handle)) {
+ if (obj_allocated(zpdesc, addr, &handle)) {
old_obj = handle_to_obj(handle);
obj_to_location(old_obj, &dummy, &obj_idx);
- new_obj = (unsigned long)location_to_obj(newpage,
- obj_idx);
+ new_obj = (unsigned long)location_to_obj(newzpdesc, obj_idx);
record_obj(handle, new_obj);
}
}
kunmap_local(s_addr);
- replace_sub_page(class, zspage, newpage, page);
+ replace_sub_page(class, zspage, newzpdesc, zpdesc);
/*
* Since we complete the data copy and set up new zspage structure,
* it's okay to release migration_lock.
@@ -1821,14 +1837,14 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
spin_unlock(&class->lock);
migrate_write_unlock(zspage);
- get_page(newpage);
- if (page_zone(newpage) != page_zone(page)) {
- dec_zone_page_state(page, NR_ZSPAGES);
- inc_zone_page_state(newpage, NR_ZSPAGES);
+ zpdesc_get(newzpdesc);
+ if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
+ zpdesc_dec_zone_page_state(zpdesc);
+ zpdesc_inc_zone_page_state(newzpdesc);
}
- reset_page(page);
- put_page(page);
+ reset_zpdesc(zpdesc);
+ zpdesc_put(zpdesc);
return MIGRATEPAGE_SUCCESS;
}
@@ -1897,13 +1913,13 @@ static void init_deferred_free(struct zs_pool *pool)
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
{
- struct page *page = get_first_page(zspage);
+ struct zpdesc *zpdesc = get_first_zpdesc(zspage);
do {
- WARN_ON(!trylock_page(page));
- __SetPageMovable(page, &zsmalloc_mops);
- unlock_page(page);
- } while ((page = get_next_page(page)) != NULL);
+ WARN_ON(!zpdesc_trylock(zpdesc));
+ __zpdesc_set_movable(zpdesc, &zsmalloc_mops);
+ zpdesc_unlock(zpdesc);
+ } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL);
}
#else
static inline void zs_flush_migration(struct zs_pool *pool) { }
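For orientation on the zsmalloc.c conversion: a zspage remains a singly linked chain of backing pages, now threaded through zpdesc->next instead of page->index, and get_next_zpdesc() returns NULL at the end of the chain (immediately so for huge zspages). A hypothetical walker, included only to make the new chain layout concrete; it relies solely on helpers introduced above and is not part of the patch:

/* Hypothetical helper: count the zpdescs backing a zspage. */
static int count_zpdescs_example(struct zspage *zspage)
{
	struct zpdesc *zpdesc = get_first_zpdesc(zspage);
	int nr = 0;

	while (zpdesc) {
		nr++;
		zpdesc = get_next_zpdesc(zpdesc);
	}
	return nr;
}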
diff --git a/mm/zswap.c b/mm/zswap.c
index b84c20d889b1..23365e76a3ce 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -43,7 +43,7 @@
* statistics
**********************************/
/* The number of compressed pages currently stored in zswap */
-atomic_long_t zswap_stored_pages = ATOMIC_INIT(0);
+atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -1192,7 +1192,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
/*
* It's safe to drop the lock here because we return either
- * LRU_REMOVED_RETRY or LRU_RETRY.
+ * LRU_REMOVED_RETRY, LRU_RETRY or LRU_STOP.
*/
spin_unlock(&l->lock);
@@ -1445,9 +1445,9 @@ resched:
* main API
**********************************/
-static ssize_t zswap_store_page(struct page *page,
- struct obj_cgroup *objcg,
- struct zswap_pool *pool)
+static bool zswap_store_page(struct page *page,
+ struct obj_cgroup *objcg,
+ struct zswap_pool *pool)
{
swp_entry_t page_swpentry = page_swap_entry(page);
struct zswap_entry *entry, *old;
@@ -1456,7 +1456,7 @@ static ssize_t zswap_store_page(struct page *page,
entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
if (!entry) {
zswap_reject_kmemcache_fail++;
- return -EINVAL;
+ return false;
}
if (!zswap_compress(page, entry, pool))
@@ -1483,13 +1483,17 @@ static ssize_t zswap_store_page(struct page *page,
/*
* The entry is successfully compressed and stored in the tree, there is
- * no further possibility of failure. Grab refs to the pool and objcg.
- * These refs will be dropped by zswap_entry_free() when the entry is
- * removed from the tree.
+ * no further possibility of failure. Grab refs to the pool and objcg,
+ * charge zswap memory, and increment zswap_stored_pages.
+ * The opposite actions will be performed by zswap_entry_free()
+ * when the entry is removed from the tree.
*/
zswap_pool_get(pool);
- if (objcg)
+ if (objcg) {
obj_cgroup_get(objcg);
+ obj_cgroup_charge_zswap(objcg, entry->length);
+ }
+ atomic_long_inc(&zswap_stored_pages);
/*
* We finish initializing the entry while it's already in xarray.
@@ -1510,13 +1514,13 @@ static ssize_t zswap_store_page(struct page *page,
zswap_lru_add(&zswap_list_lru, entry);
}
- return entry->length;
+ return true;
store_failed:
zpool_free(pool->zpool, entry->handle);
compress_failed:
zswap_entry_cache_free(entry);
- return -EINVAL;
+ return false;
}
bool zswap_store(struct folio *folio)
@@ -1526,7 +1530,6 @@ bool zswap_store(struct folio *folio)
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
struct zswap_pool *pool;
- size_t compressed_bytes = 0;
bool ret = false;
long index;
@@ -1564,20 +1567,14 @@ bool zswap_store(struct folio *folio)
for (index = 0; index < nr_pages; ++index) {
struct page *page = folio_page(folio, index);
- ssize_t bytes;
- bytes = zswap_store_page(page, objcg, pool);
- if (bytes < 0)
+ if (!zswap_store_page(page, objcg, pool))
goto put_pool;
- compressed_bytes += bytes;
}
- if (objcg) {
- obj_cgroup_charge_zswap(objcg, compressed_bytes);
+ if (objcg)
count_objcg_events(objcg, ZSWPOUT, nr_pages);
- }
- atomic_long_add(nr_pages, &zswap_stored_pages);
count_vm_events(ZSWPOUT, nr_pages);
ret = true;