path: root/include/linux
author    Linus Torvalds <torvalds@linux-foundation.org>  2022-10-10 17:53:04 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2022-10-10 17:53:04 -0700
commit    27bc50fc90647bbf7b734c3fc306a5e61350da53 (patch)
tree      75fc525fbfec8c07a97a7875a89592317bcad4ca /include/linux
parent    70442fc54e6889a2a77f0e9554e8188a1557f00e (diff)
parent    bbff39cc6cbcb86ccfacb2dcafc79912a9f9df69 (diff)
Merge tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:

 - Yu Zhao's Multi-Gen LRU patches are here. They've been under test in
   linux-next for a couple of months without, to my knowledge, any negative
   reports (or any positive ones, come to that).

 - Also the Maple Tree from Liam Howlett. An overlapping range-based tree
   for vmas. It is apparently slightly more efficient in its own right, but
   is mainly targeted at enabling work to reduce mmap_lock contention.

   Liam has identified a number of other tree users in the kernel which
   could be beneficially converted to maple trees.

   Yu Zhao has identified a hard-to-hit but "easy to fix" lockdep splat at
   [1]. This has yet to be addressed due to Liam's unfortunately timed
   vacation. He is now back and we'll get this fixed up.

 - Dmitry Vyukov introduces KMSAN: the Kernel Memory Sanitizer. It uses
   clang-generated instrumentation to detect use-of-uninitialized bugs down
   to the single-bit level.

   KMSAN keeps finding bugs. New ones, as well as the legacy ones.

 - Yang Shi adds a userspace mechanism (madvise) to induce a collapse of
   memory into THPs.

 - Zach O'Keefe has expanded Yang Shi's madvise(MADV_COLLAPSE) to support
   file/shmem-backed pages.

 - userfaultfd updates from Axel Rasmussen

 - zsmalloc cleanups from Alexey Romanov

 - cleanups from Miaohe Lin: vmscan, hugetlb_cgroup, hugetlb and
   memory-failure

 - Huang Ying adds enhancements to NUMA balancing memory tiering mode's
   page promotion, with a new way of detecting hot pages.

 - memcg updates from Shakeel Butt: charging optimizations and reduced
   memory consumption.

 - memcg cleanups from Kairui Song.

 - memcg fixes and cleanups from Johannes Weiner.

 - Vishal Moola provides more folio conversions

 - Zhang Yi removed ll_rw_block() :(

 - migration enhancements from Peter Xu

 - migration error-path bugfixes from Huang Ying

 - Aneesh Kumar added the ability for a device driver to alter the memory
   tiering promotion paths, for optimizations by PMEM drivers, DRM drivers,
   etc.

 - vma merging improvements from Jakub Matěna.

 - NUMA hinting cleanups from David Hildenbrand.

 - xu xin added additional userspace visibility into KSM merging activity.

 - THP & KSM code consolidation from Qi Zheng.

 - more folio work from Matthew Wilcox.

 - KASAN updates from Andrey Konovalov.

 - DAMON cleanups from Kaixu Xia.

 - DAMON work from SeongJae Park: fixes, cleanups.

 - hugetlb sysfs cleanups from Muchun Song.

 - Mike Kravetz fixes locking issues in hugetlbfs and in hugetlb core.
Link: https://lkml.kernel.org/r/CAOUHufZabH85CeUN-MEMgL8gJGzJEWUrkiM58JkTbBhh-jew0Q@mail.gmail.com [1]

* tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (555 commits)
  hugetlb: allocate vma lock for all sharable vmas
  hugetlb: take hugetlb vma_lock when clearing vma_lock->vma pointer
  hugetlb: fix vma lock handling during split vma and range unmapping
  mglru: mm/vmscan.c: fix imprecise comments
  mm/mglru: don't sync disk for each aging cycle
  mm: memcontrol: drop dead CONFIG_MEMCG_SWAP config symbol
  mm: memcontrol: use do_memsw_account() in a few more places
  mm: memcontrol: deprecate swapaccounting=0 mode
  mm: memcontrol: don't allocate cgroup swap arrays when memcg is disabled
  mm/secretmem: remove reduntant return value
  mm/hugetlb: add available_huge_pages() func
  mm: remove unused inline functions from include/linux/mm_inline.h
  selftests/vm: add selftest for MADV_COLLAPSE of uffd-minor memory
  selftests/vm: add file/shmem MADV_COLLAPSE selftest for cleared pmd
  selftests/vm: add thp collapse shmem testing
  selftests/vm: add thp collapse file and tmpfs testing
  selftests/vm: modularize thp collapse memory operations
  selftests/vm: dedup THP helpers
  mm/khugepaged: add tracepoint to hpage_collapse_scan_file()
  mm/madvise: add file and shmem support to MADV_COLLAPSE
  ...
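The madvise(MADV_COLLAPSE) mechanism described in the pull message can be exercised from userspace roughly as follows. This is a minimal sketch, assuming a 2 MiB PMD size and the MADV_COLLAPSE value from the 6.1 uapi headers; it is not part of this merge.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* uapi value in 6.1; assumed fallback for older headers */
#endif

#define HPAGE_SIZE (2UL << 20)	/* assumes 2 MiB PMD-sized THPs */

int main(void)
{
	/* Over-allocate so a PMD-aligned 2 MiB window certainly exists. */
	char *raw = mmap(NULL, 2 * HPAGE_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *huge;

	if (raw == MAP_FAILED)
		return 1;
	huge = (char *)(((uintptr_t)raw + HPAGE_SIZE - 1) & ~(HPAGE_SIZE - 1));
	memset(huge, 0xab, HPAGE_SIZE);		/* fault in the small pages first */

	if (madvise(huge, HPAGE_SIZE, MADV_COLLAPSE))
		perror("madvise(MADV_COLLAPSE)");	/* older kernels reject the advice */
	return 0;
}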
Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/buffer_head.h | 48
-rw-r--r--  include/linux/cache.h | 13
-rw-r--r--  include/linux/cgroup.h | 15
-rw-r--r--  include/linux/compiler-clang.h | 23
-rw-r--r--  include/linux/compiler-gcc.h | 6
-rw-r--r--  include/linux/compiler_types.h | 3
-rw-r--r--  include/linux/damon.h | 86
-rw-r--r--  include/linux/delayacct.h | 16
-rw-r--r--  include/linux/fortify-string.h | 2
-rw-r--r--  include/linux/gfp.h | 26
-rw-r--r--  include/linux/highmem.h | 3
-rw-r--r--  include/linux/huge_mm.h | 28
-rw-r--r--  include/linux/hugetlb.h | 63
-rw-r--r--  include/linux/hugetlb_cgroup.h | 19
-rw-r--r--  include/linux/instrumented.h | 59
-rw-r--r--  include/linux/kasan.h | 55
-rw-r--r--  include/linux/khugepaged.h | 13
-rw-r--r--  include/linux/kmsan-checks.h | 83
-rw-r--r--  include/linux/kmsan.h | 330
-rw-r--r--  include/linux/kmsan_types.h | 35
-rw-r--r--  include/linux/ksm.h | 3
-rw-r--r--  include/linux/maple_tree.h | 685
-rw-r--r--  include/linux/memcontrol.h | 99
-rw-r--r--  include/linux/memory-tiers.h | 102
-rw-r--r--  include/linux/memory_hotplug.h | 30
-rw-r--r--  include/linux/mempolicy.h | 13
-rw-r--r--  include/linux/migrate.h | 15
-rw-r--r--  include/linux/mm.h | 161
-rw-r--r--  include/linux/mm_inline.h | 242
-rw-r--r--  include/linux/mm_types.h | 178
-rw-r--r--  include/linux/mm_types_task.h | 12
-rw-r--r--  include/linux/mmzone.h | 281
-rw-r--r--  include/linux/node.h | 29
-rw-r--r--  include/linux/nodemask.h | 14
-rw-r--r--  include/linux/oom.h | 11
-rw-r--r--  include/linux/page-flags-layout.h | 16
-rw-r--r--  include/linux/page-flags.h | 4
-rw-r--r--  include/linux/page_counter.h | 26
-rw-r--r--  include/linux/page_ext.h | 24
-rw-r--r--  include/linux/page_idle.h | 34
-rw-r--r--  include/linux/pageblock-flags.h | 4
-rw-r--r--  include/linux/pagemap.h | 14
-rw-r--r--  include/linux/pagewalk.h | 10
-rw-r--r--  include/linux/pgtable.h | 26
-rw-r--r--  include/linux/rmap.h | 73
-rw-r--r--  include/linux/sched.h | 14
-rw-r--r--  include/linux/sched/coredump.h | 7
-rw-r--r--  include/linux/sched/sysctl.h | 1
-rw-r--r--  include/linux/shmem_fs.h | 16
-rw-r--r--  include/linux/slab.h | 8
-rw-r--r--  include/linux/stackdepot.h | 8
-rw-r--r--  include/linux/swap.h | 41
-rw-r--r--  include/linux/swap_cgroup.h | 4
-rw-r--r--  include/linux/swapfile.h | 7
-rw-r--r--  include/linux/swapops.h | 150
-rw-r--r--  include/linux/uaccess.h | 19
-rw-r--r--  include/linux/userfaultfd_k.h | 7
-rw-r--r--  include/linux/vm_event_item.h | 4
-rw-r--r--  include/linux/vmacache.h | 28
-rw-r--r--  include/linux/vmstat.h | 6
-rw-r--r--  include/linux/writeback.h | 8
61 files changed, 2793 insertions(+), 567 deletions(-)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 06089390d81d..33fa5e94aa80 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -225,8 +225,6 @@ struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
-void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size,
- gfp_t gfp);
struct buffer_head *__bread_gfp(struct block_device *,
sector_t block, unsigned size, gfp_t gfp);
void invalidate_bh_lrus(void);
@@ -236,7 +234,6 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh);
void __lock_buffer(struct buffer_head *bh);
-void ll_rw_block(blk_opf_t, int, struct buffer_head * bh[]);
int sync_dirty_buffer(struct buffer_head *bh);
int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
@@ -244,7 +241,9 @@ void submit_bh(blk_opf_t, struct buffer_head *);
void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize);
int bh_uptodate_or_lock(struct buffer_head *bh);
-int bh_submit_read(struct buffer_head *bh);
+int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
+void __bh_read_batch(int nr, struct buffer_head *bhs[],
+ blk_opf_t op_flags, bool force_lock);
extern int buffer_heads_over_limit;
@@ -351,12 +350,6 @@ sb_breadahead(struct super_block *sb, sector_t block)
__breadahead(sb->s_bdev, block, sb->s_blocksize);
}
-static inline void
-sb_breadahead_unmovable(struct super_block *sb, sector_t block)
-{
- __breadahead_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
-}
-
static inline struct buffer_head *
sb_getblk(struct super_block *sb, sector_t block)
{
@@ -418,6 +411,41 @@ static inline struct buffer_head *__getblk(struct block_device *bdev,
return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
}
+static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags)
+{
+ if (!buffer_uptodate(bh) && trylock_buffer(bh)) {
+ if (!buffer_uptodate(bh))
+ __bh_read(bh, op_flags, false);
+ else
+ unlock_buffer(bh);
+ }
+}
+
+static inline void bh_read_nowait(struct buffer_head *bh, blk_opf_t op_flags)
+{
+ if (!bh_uptodate_or_lock(bh))
+ __bh_read(bh, op_flags, false);
+}
+
+/* Returns 1 if buffer uptodated, 0 on success, and -EIO on error. */
+static inline int bh_read(struct buffer_head *bh, blk_opf_t op_flags)
+{
+ if (bh_uptodate_or_lock(bh))
+ return 1;
+ return __bh_read(bh, op_flags, true);
+}
+
+static inline void bh_read_batch(int nr, struct buffer_head *bhs[])
+{
+ __bh_read_batch(nr, bhs, 0, true);
+}
+
+static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[],
+ blk_opf_t op_flags)
+{
+ __bh_read_batch(nr, bhs, op_flags, false);
+}
+
/**
* __bread() - reads a specified block and returns the bh
* @bdev: the block_device to read from
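With ll_rw_block() removed, in-kernel users read buffers through the new helpers declared above. A hedged sketch, not part of this diff, of how a filesystem might do a synchronous read with bh_read() (the function name is invented); bh_read() returns 1 if the buffer was already uptodate, 0 after a successful read, and -EIO on error.

/* Illustrative only: replaces the old ll_rw_block() + wait_on_buffer() pattern. */
static int example_read_block(struct super_block *sb, sector_t block,
			      struct buffer_head **bhp)
{
	struct buffer_head *bh = sb_getblk(sb, block);

	if (unlikely(!bh))
		return -ENOMEM;

	/* Old pattern: ll_rw_block(REQ_OP_READ, 1, &bh); wait_on_buffer(bh); */
	if (bh_read(bh, 0) < 0) {
		brelse(bh);
		return -EIO;
	}
	*bhp = bh;
	return 0;
}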
diff --git a/include/linux/cache.h b/include/linux/cache.h
index d742c57eaee5..5da1bbd96154 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -85,4 +85,17 @@
#define cache_line_size() L1_CACHE_BYTES
#endif
+/*
+ * Helper to add padding within a struct to ensure data fall into separate
+ * cachelines.
+ */
+#if defined(CONFIG_SMP)
+struct cacheline_padding {
+ char x[0];
+} ____cacheline_internodealigned_in_smp;
+#define CACHELINE_PADDING(name) struct cacheline_padding name
+#else
+#define CACHELINE_PADDING(name)
+#endif
+
#endif /* __LINUX_CACHE_H */
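A hypothetical use of the new CACHELINE_PADDING() helper (struct and field names are invented): keeping two independently written counters on separate cachelines to avoid false sharing. On !SMP builds the padding compiles away.

struct example_stats {
	atomic_long_t	produced;	/* written by the producer CPU */
	CACHELINE_PADDING(_pad1_);
	atomic_long_t	consumed;	/* written by the consumer CPU */
	CACHELINE_PADDING(_pad2_);
};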
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 398f0bce7c21..23b102b4349e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
css_put(&cgrp->self);
}
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+ mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+ mutex_unlock(&cgroup_mutex);
+}
+
/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
@@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
@@ -707,6 +718,8 @@ struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
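The cgroup_mutex declaration moves out of the CONFIG_PROVE_RCU block so the new cgroup_lock()/cgroup_unlock() wrappers are always available, with no-op stubs when cgroups are disabled. A hedged sketch of a caller (the function is illustrative only):

static void example_update_under_cgroup_lock(void)
{
	cgroup_lock();
	/* ... touch state that must not race with cgroup creation/removal ... */
	cgroup_unlock();
}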
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 42e55579d649..6cfd6902bd5b 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -51,6 +51,29 @@
#define __no_sanitize_undefined
#endif
+#if __has_feature(memory_sanitizer)
+#define __SANITIZE_MEMORY__
+/*
+ * Unlike other sanitizers, KMSAN still inserts code into functions marked with
+ * no_sanitize("kernel-memory"). Using disable_sanitizer_instrumentation
+ * provides the behavior consistent with other __no_sanitize_ attributes,
+ * guaranteeing that __no_sanitize_memory functions remain uninstrumented.
+ */
+#define __no_sanitize_memory __disable_sanitizer_instrumentation
+
+/*
+ * The __no_kmsan_checks attribute ensures that a function does not produce
+ * false positive reports by:
+ * - initializing all local variables and memory stores in this function;
+ * - skipping all shadow checks;
+ * - passing initialized arguments to this function's callees.
+ */
+#define __no_kmsan_checks __attribute__((no_sanitize("kernel-memory")))
+#else
+#define __no_sanitize_memory
+#define __no_kmsan_checks
+#endif
+
/*
* Support for __has_feature(coverage_sanitizer) was added in Clang 13 together
* with no_sanitize("coverage"). Prior versions of Clang support coverage
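A hedged sketch of how the new annotation might be applied (function and data are invented): a routine that deliberately reads memory KMSAN cannot track is marked __no_kmsan_checks so it does not produce false positives.

__no_kmsan_checks
static u32 example_hash_device_words(const u32 *words, size_t n)
{
	u32 hash = 0;
	size_t i;

	/* Shadow checks are skipped here; loads are treated as initialized. */
	for (i = 0; i < n; i++)
		hash ^= words[i];
	return hash;
}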
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 9b157b71036f..f55a37efdb97 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -115,6 +115,12 @@
#endif
/*
+ * GCC does not support KMSAN.
+ */
+#define __no_sanitize_memory
+#define __no_kmsan_checks
+
+/*
* Turn individual warnings and errors on and off locally, depending
* on version.
*/
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 74e04ecd4c89..eb0466236661 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -233,7 +233,8 @@ struct ftrace_likely_data {
/* Section for code which can't be instrumented at all */
#define noinstr \
noinline notrace __attribute((__section__(".noinstr.text"))) \
- __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage
+ __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \
+ __no_sanitize_memory
#endif /* __KERNEL__ */
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 7b1f4a488230..ed5470f50bab 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -216,13 +216,26 @@ struct damos_stat {
};
/**
- * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
+ * struct damos_access_pattern - Target access pattern of the given scheme.
* @min_sz_region: Minimum size of target regions.
* @max_sz_region: Maximum size of target regions.
* @min_nr_accesses: Minimum ``->nr_accesses`` of target regions.
* @max_nr_accesses: Maximum ``->nr_accesses`` of target regions.
* @min_age_region: Minimum age of target regions.
* @max_age_region: Maximum age of target regions.
+ */
+struct damos_access_pattern {
+ unsigned long min_sz_region;
+ unsigned long max_sz_region;
+ unsigned int min_nr_accesses;
+ unsigned int max_nr_accesses;
+ unsigned int min_age_region;
+ unsigned int max_age_region;
+};
+
+/**
+ * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
+ * @pattern: Access pattern of target regions.
* @action: &damo_action to be applied to the target regions.
* @quota: Control the aggressiveness of this scheme.
* @wmarks: Watermarks for automated (in)activation of this scheme.
@@ -230,10 +243,8 @@ struct damos_stat {
* @list: List head for siblings.
*
* For each aggregation interval, DAMON finds regions which fit in the
- * condition (&min_sz_region, &max_sz_region, &min_nr_accesses,
- * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to
- * those. To avoid consuming too much CPU time or IO resources for the
- * &action, &quota is used.
+ * &pattern and applies &action to those. To avoid consuming too much
+ * CPU time or IO resources for the &action, &quota is used.
*
* To do the work only when needed, schemes can be activated for specific
* system situations using &wmarks. If all schemes that registered to the
@@ -248,12 +259,7 @@ struct damos_stat {
* &action is applied.
*/
struct damos {
- unsigned long min_sz_region;
- unsigned long max_sz_region;
- unsigned int min_nr_accesses;
- unsigned int max_nr_accesses;
- unsigned int min_age_region;
- unsigned int max_age_region;
+ struct damos_access_pattern pattern;
enum damos_action action;
struct damos_quota quota;
struct damos_watermarks wmarks;
@@ -340,7 +346,7 @@ struct damon_operations {
unsigned long (*apply_scheme)(struct damon_ctx *context,
struct damon_target *t, struct damon_region *r,
struct damos *scheme);
- bool (*target_valid)(void *target);
+ bool (*target_valid)(struct damon_target *t);
void (*cleanup)(struct damon_ctx *context);
};
@@ -383,13 +389,15 @@ struct damon_callback {
};
/**
- * struct damon_ctx - Represents a context for each monitoring. This is the
- * main interface that allows users to set the attributes and get the results
- * of the monitoring.
+ * struct damon_attrs - Monitoring attributes for accuracy/overhead control.
*
* @sample_interval: The time between access samplings.
* @aggr_interval: The time between monitor results aggregations.
* @ops_update_interval: The time between monitoring operations updates.
+ * @min_nr_regions: The minimum number of adaptive monitoring
+ * regions.
+ * @max_nr_regions: The maximum number of adaptive monitoring
+ * regions.
*
* For each @sample_interval, DAMON checks whether each region is accessed or
* not. It aggregates and keeps the access information (number of accesses to
@@ -399,7 +407,21 @@ struct damon_callback {
* @ops_update_interval. All time intervals are in micro-seconds.
* Please refer to &struct damon_operations and &struct damon_callback for more
* detail.
+ */
+struct damon_attrs {
+ unsigned long sample_interval;
+ unsigned long aggr_interval;
+ unsigned long ops_update_interval;
+ unsigned long min_nr_regions;
+ unsigned long max_nr_regions;
+};
+
+/**
+ * struct damon_ctx - Represents a context for each monitoring. This is the
+ * main interface that allows users to set the attributes and get the results
+ * of the monitoring.
*
+ * @attrs: Monitoring attributes for accuracy/overhead control.
* @kdamond: Kernel thread who does the monitoring.
* @kdamond_lock: Mutex for the synchronizations with @kdamond.
*
@@ -421,15 +443,11 @@ struct damon_callback {
* @ops: Set of monitoring operations for given use cases.
* @callback: Set of callbacks for monitoring events notifications.
*
- * @min_nr_regions: The minimum number of adaptive monitoring regions.
- * @max_nr_regions: The maximum number of adaptive monitoring regions.
* @adaptive_targets: Head of monitoring targets (&damon_target) list.
* @schemes: Head of schemes (&damos) list.
*/
struct damon_ctx {
- unsigned long sample_interval;
- unsigned long aggr_interval;
- unsigned long ops_update_interval;
+ struct damon_attrs attrs;
/* private: internal use only */
struct timespec64 last_aggregation;
@@ -442,8 +460,6 @@ struct damon_ctx {
struct damon_operations ops;
struct damon_callback callback;
- unsigned long min_nr_regions;
- unsigned long max_nr_regions;
struct list_head adaptive_targets;
struct list_head schemes;
};
@@ -463,9 +479,17 @@ static inline struct damon_region *damon_last_region(struct damon_target *t)
return list_last_entry(&t->regions_list, struct damon_region, list);
}
+static inline struct damon_region *damon_first_region(struct damon_target *t)
+{
+ return list_first_entry(&t->regions_list, struct damon_region, list);
+}
+
#define damon_for_each_region(r, t) \
list_for_each_entry(r, &t->regions_list, list)
+#define damon_for_each_region_from(r, t) \
+ list_for_each_entry_from(r, &t->regions_list, list)
+
#define damon_for_each_region_safe(r, next, t) \
list_for_each_entry_safe(r, next, &t->regions_list, list)
@@ -501,12 +525,9 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t);
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
unsigned int nr_ranges);
-struct damos *damon_new_scheme(
- unsigned long min_sz_region, unsigned long max_sz_region,
- unsigned int min_nr_accesses, unsigned int max_nr_accesses,
- unsigned int min_age_region, unsigned int max_age_region,
- enum damos_action action, struct damos_quota *quota,
- struct damos_watermarks *wmarks);
+struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
+ enum damos_action action, struct damos_quota *quota,
+ struct damos_watermarks *wmarks);
void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);
void damon_destroy_scheme(struct damos *s);
@@ -519,10 +540,8 @@ unsigned int damon_nr_regions(struct damon_target *t);
struct damon_ctx *damon_new_ctx(void);
void damon_destroy_ctx(struct damon_ctx *ctx);
-int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
- unsigned long aggr_int, unsigned long ops_upd_int,
- unsigned long min_nr_reg, unsigned long max_nr_reg);
-int damon_set_schemes(struct damon_ctx *ctx,
+int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs);
+void damon_set_schemes(struct damon_ctx *ctx,
struct damos **schemes, ssize_t nr_schemes);
int damon_nr_running_ctxs(void);
bool damon_is_registered_ops(enum damon_ops_id id);
@@ -538,6 +557,9 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx)
int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
+int damon_set_region_biggest_system_ram_default(struct damon_target *t,
+ unsigned long *start, unsigned long *end);
+
#endif /* CONFIG_DAMON */
#endif /* _DAMON_H */
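With the refactoring above, a scheme's target access pattern and the monitoring attributes now travel in dedicated structs. A hedged sketch of building a scheme against the new damon_new_scheme() signature; quota and watermark details are elided (zero-initialized) and DAMOS_PAGEOUT is used only as a plausible example action.

static struct damos *example_new_pageout_scheme(void)
{
	struct damos_access_pattern pattern = {
		.min_sz_region	 = PAGE_SIZE,
		.max_sz_region	 = ULONG_MAX,
		.min_nr_accesses = 0,
		.max_nr_accesses = 0,		/* regions that look idle */
		.min_age_region	 = 10,
		.max_age_region	 = UINT_MAX,
	};
	struct damos_quota quota = {};
	struct damos_watermarks wmarks = {};

	return damon_new_scheme(&pattern, DAMOS_PAGEOUT, &quota, &wmarks);
}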
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 58aea2d7385c..0da97dba9ef8 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -73,8 +73,8 @@ extern int delayacct_add_tsk(struct taskstats *, struct task_struct *);
extern __u64 __delayacct_blkio_ticks(struct task_struct *);
extern void __delayacct_freepages_start(void);
extern void __delayacct_freepages_end(void);
-extern void __delayacct_thrashing_start(void);
-extern void __delayacct_thrashing_end(void);
+extern void __delayacct_thrashing_start(bool *in_thrashing);
+extern void __delayacct_thrashing_end(bool *in_thrashing);
extern void __delayacct_swapin_start(void);
extern void __delayacct_swapin_end(void);
extern void __delayacct_compact_start(void);
@@ -143,22 +143,22 @@ static inline void delayacct_freepages_end(void)
__delayacct_freepages_end();
}
-static inline void delayacct_thrashing_start(void)
+static inline void delayacct_thrashing_start(bool *in_thrashing)
{
if (!static_branch_unlikely(&delayacct_key))
return;
if (current->delays)
- __delayacct_thrashing_start();
+ __delayacct_thrashing_start(in_thrashing);
}
-static inline void delayacct_thrashing_end(void)
+static inline void delayacct_thrashing_end(bool *in_thrashing)
{
if (!static_branch_unlikely(&delayacct_key))
return;
if (current->delays)
- __delayacct_thrashing_end();
+ __delayacct_thrashing_end(in_thrashing);
}
static inline void delayacct_swapin_start(void)
@@ -237,9 +237,9 @@ static inline void delayacct_freepages_start(void)
{}
static inline void delayacct_freepages_end(void)
{}
-static inline void delayacct_thrashing_start(void)
+static inline void delayacct_thrashing_start(bool *in_thrashing)
{}
-static inline void delayacct_thrashing_end(void)
+static inline void delayacct_thrashing_end(bool *in_thrashing)
{}
static inline void delayacct_swapin_start(void)
{}
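The thrashing hooks now take a caller-provided flag so start/end calls pair correctly even when the task was already accounted as thrashing. A minimal sketch of the pairing (the actual wait is elided):

static void example_account_thrashing_wait(void)
{
	bool in_thrashing;

	delayacct_thrashing_start(&in_thrashing);
	/* ... sleep waiting for the page under reclaim/writeback ... */
	delayacct_thrashing_end(&in_thrashing);
}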
diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index b62c90cfafaf..4029fe368a4f 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -328,8 +328,10 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size,
* __struct_size() vs __member_size() must be captured here to avoid
* evaluating argument side-effects further into the macro layers.
*/
+#ifndef CONFIG_KMSAN
#define memset(p, c, s) __fortify_memset_chk(p, c, s, \
__struct_size(p), __member_size(p))
+#endif
/*
* To make sure the compiler can enforce protection against buffer overflows,
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f314be58fa77..ef4aea3b356e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -18,6 +18,9 @@ static inline int gfp_migratetype(const gfp_t gfp_flags)
VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
+ BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE);
+ BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >>
+ GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC);
if (unlikely(page_group_by_mobility_disabled))
return MIGRATE_UNMOVABLE;
@@ -33,29 +36,6 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
}
-/**
- * gfpflags_normal_context - is gfp_flags a normal sleepable context?
- * @gfp_flags: gfp_flags to test
- *
- * Test whether @gfp_flags indicates that the allocation is from the
- * %current context and allowed to sleep.
- *
- * An allocation being allowed to block doesn't mean it owns the %current
- * context. When direct reclaim path tries to allocate memory, the
- * allocation context is nested inside whatever %current was doing at the
- * time of the original allocation. The nested allocation may be allowed
- * to block but modifying anything %current owns can corrupt the outer
- * context's expectations.
- *
- * %true result from this function indicates that the allocation context
- * can sleep and use anything that's associated with %current.
- */
-static inline bool gfpflags_normal_context(const gfp_t gfp_flags)
-{
- return (gfp_flags & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC)) ==
- __GFP_DIRECT_RECLAIM;
-}
-
#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 25679035ca28..e9912da5441b 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -6,6 +6,7 @@
#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/cacheflush.h>
+#include <linux/kmsan.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
@@ -311,6 +312,7 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
vfrom = kmap_local_page(from);
vto = kmap_local_page(to);
copy_user_page(vto, vfrom, vaddr, to);
+ kmsan_unpoison_memory(page_address(to), PAGE_SIZE);
kunmap_local(vto);
kunmap_local(vfrom);
}
@@ -326,6 +328,7 @@ static inline void copy_highpage(struct page *to, struct page *from)
vfrom = kmap_local_page(from);
vto = kmap_local_page(to);
copy_page(vto, vfrom);
+ kmsan_copy_page_meta(to, from);
kunmap_local(vto);
kunmap_local(vfrom);
}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 768e5261fdae..a1341fdcf666 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
!inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
}
-bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags,
- bool smaps, bool in_pf);
+bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
+ bool smaps, bool in_pf, bool enforce_sysfs);
#define transparent_hugepage_use_zero_page() \
(transparent_hugepage_flags & \
@@ -219,6 +218,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
int advice);
+int madvise_collapse(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end);
void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
unsigned long end, long adjust_next);
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
@@ -321,8 +323,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
}
static inline bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags,
- bool smaps, bool in_pf)
+ unsigned long vm_flags, bool smaps,
+ bool in_pf, bool enforce_sysfs)
{
return false;
}
@@ -362,9 +364,16 @@ static inline void split_huge_pmd_address(struct vm_area_struct *vma,
static inline int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
- BUG();
- return 0;
+ return -EINVAL;
+}
+
+static inline int madvise_collapse(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ return -EINVAL;
}
+
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
@@ -435,6 +444,11 @@ static inline int split_folio_to_list(struct folio *folio,
return split_huge_page_to_list(&folio->page, list);
}
+static inline int split_folio(struct folio *folio)
+{
+ return split_folio_to_list(folio, NULL);
+}
+
/*
* archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to
* limitations in the implementation like arm64 MTE can override this to
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1ec1535be04f..95fda85aa195 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -16,6 +16,7 @@
struct ctl_table;
struct user_struct;
struct mmu_gather;
+struct node;
#ifndef CONFIG_ARCH_HAS_HUGEPD
typedef struct { unsigned long pd; } hugepd_t;
@@ -114,6 +115,12 @@ struct file_region {
#endif
};
+struct hugetlb_vma_lock {
+ struct kref refs;
+ struct rw_semaphore rw_sema;
+ struct vm_area_struct *vma;
+};
+
extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);
@@ -126,7 +133,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
long min_hpages);
void hugepage_put_subpool(struct hugepage_subpool *spool);
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
+void hugetlb_dup_vma_private(struct vm_area_struct *vma);
void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
@@ -214,6 +221,14 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
pgd_t *pgd, int flags);
+void hugetlb_vma_lock_read(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
+void hugetlb_vma_lock_write(struct vm_area_struct *vma);
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
+void hugetlb_vma_lock_release(struct kref *kref);
+
int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pud);
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -225,7 +240,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
#else /* !CONFIG_HUGETLB_PAGE */
-static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
}
@@ -336,6 +351,31 @@ static inline int prepare_hugepage_range(struct file *file,
return -EINVAL;
}
+static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ return 1;
+}
+
+static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
static inline int pmd_huge(pmd_t pmd)
{
return 0;
@@ -665,7 +705,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask);
struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
unsigned long address);
-int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx);
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
unsigned long address, struct page *page);
@@ -935,6 +975,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
}
#endif
+#ifdef CONFIG_NUMA
+void hugetlb_register_node(struct node *node);
+void hugetlb_unregister_node(struct node *node);
+#endif
+
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
@@ -1109,6 +1154,14 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
}
+
+static inline void hugetlb_register_node(struct node *node)
+{
+}
+
+static inline void hugetlb_unregister_node(struct node *node)
+{
+}
#endif /* CONFIG_HUGETLB_PAGE */
static inline spinlock_t *huge_pte_lock(struct hstate *h,
@@ -1123,14 +1176,10 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h,
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
extern void __init hugetlb_cma_reserve(int order);
-extern void __init hugetlb_cma_check(void);
#else
static inline __init void hugetlb_cma_reserve(int order)
{
}
-static inline __init void hugetlb_cma_check(void)
-{
-}
#endif
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
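A hedged sketch of the intended locking pattern for the new per-VMA hugetlb lock declared above (illustrative only): readers of the huge page tables take it for read, while paths that unshare PMDs take it for write.

static void example_scan_hugetlb_vma(struct vm_area_struct *vma)
{
	hugetlb_vma_lock_read(vma);
	/* ... walk @vma's huge PTEs without racing with PMD unsharing ... */
	hugetlb_vma_unlock_read(vma);
}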
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 379344828e78..630cd255d0cf 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -90,32 +90,31 @@ hugetlb_cgroup_from_page_rsvd(struct page *page)
return __hugetlb_cgroup_from_page(page, true);
}
-static inline int __set_hugetlb_cgroup(struct page *page,
+static inline void __set_hugetlb_cgroup(struct page *page,
struct hugetlb_cgroup *h_cg, bool rsvd)
{
VM_BUG_ON_PAGE(!PageHuge(page), page);
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
- return -1;
+ return;
if (rsvd)
set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD,
(unsigned long)h_cg);
else
set_page_private(page + SUBPAGE_INDEX_CGROUP,
(unsigned long)h_cg);
- return 0;
}
-static inline int set_hugetlb_cgroup(struct page *page,
+static inline void set_hugetlb_cgroup(struct page *page,
struct hugetlb_cgroup *h_cg)
{
- return __set_hugetlb_cgroup(page, h_cg, false);
+ __set_hugetlb_cgroup(page, h_cg, false);
}
-static inline int set_hugetlb_cgroup_rsvd(struct page *page,
+static inline void set_hugetlb_cgroup_rsvd(struct page *page,
struct hugetlb_cgroup *h_cg)
{
- return __set_hugetlb_cgroup(page, h_cg, true);
+ __set_hugetlb_cgroup(page, h_cg, true);
}
static inline bool hugetlb_cgroup_disabled(void)
@@ -199,16 +198,14 @@ hugetlb_cgroup_from_page_rsvd(struct page *page)
return NULL;
}
-static inline int set_hugetlb_cgroup(struct page *page,
+static inline void set_hugetlb_cgroup(struct page *page,
struct hugetlb_cgroup *h_cg)
{
- return 0;
}
-static inline int set_hugetlb_cgroup_rsvd(struct page *page,
+static inline void set_hugetlb_cgroup_rsvd(struct page *page,
struct hugetlb_cgroup *h_cg)
{
- return 0;
}
static inline bool hugetlb_cgroup_disabled(void)
diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h
index 42faebbaa202..501fa8486749 100644
--- a/include/linux/instrumented.h
+++ b/include/linux/instrumented.h
@@ -2,7 +2,7 @@
/*
* This header provides generic wrappers for memory access instrumentation that
- * the compiler cannot emit for: KASAN, KCSAN.
+ * the compiler cannot emit for: KASAN, KCSAN, KMSAN.
*/
#ifndef _LINUX_INSTRUMENTED_H
#define _LINUX_INSTRUMENTED_H
@@ -10,6 +10,7 @@
#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>
+#include <linux/kmsan-checks.h>
#include <linux/types.h>
/**
@@ -117,10 +118,11 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n)
{
kasan_check_read(from, n);
kcsan_check_read(from, n);
+ kmsan_copy_to_user(to, from, n, 0);
}
/**
- * instrument_copy_from_user - instrument writes of copy_from_user
+ * instrument_copy_from_user_before - add instrumentation before copy_from_user
*
* Instrument writes to kernel memory, that are due to copy_from_user (and
* variants). The instrumentation should be inserted before the accesses.
@@ -130,10 +132,61 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n)
* @n number of bytes to copy
*/
static __always_inline void
-instrument_copy_from_user(const void *to, const void __user *from, unsigned long n)
+instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n)
{
kasan_check_write(to, n);
kcsan_check_write(to, n);
}
+/**
+ * instrument_copy_from_user_after - add instrumentation after copy_from_user
+ *
+ * Instrument writes to kernel memory, that are due to copy_from_user (and
+ * variants). The instrumentation should be inserted after the accesses.
+ *
+ * @to destination address
+ * @from source address
+ * @n number of bytes to copy
+ * @left number of bytes not copied (as returned by copy_from_user)
+ */
+static __always_inline void
+instrument_copy_from_user_after(const void *to, const void __user *from,
+ unsigned long n, unsigned long left)
+{
+ kmsan_unpoison_memory(to, n - left);
+}
+
+/**
+ * instrument_get_user() - add instrumentation to get_user()-like macros
+ *
+ * get_user() and friends are fragile, so it may depend on the implementation
+ * whether the instrumentation happens before or after the data is copied from
+ * the userspace.
+ *
+ * @to destination variable, may not be address-taken
+ */
+#define instrument_get_user(to) \
+({ \
+ u64 __tmp = (u64)(to); \
+ kmsan_unpoison_memory(&__tmp, sizeof(__tmp)); \
+ to = __tmp; \
+})
+
+
+/**
+ * instrument_put_user() - add instrumentation to put_user()-like macros
+ *
+ * put_user() and friends are fragile, so it may depend on the implementation
+ * whether the instrumentation happens before or after the data is copied from
+ * the userspace.
+ *
+ * @from source address
+ * @ptr userspace pointer to copy to
+ * @size number of bytes to copy
+ */
+#define instrument_put_user(from, ptr, size) \
+({ \
+ kmsan_copy_to_user(ptr, &from, sizeof(from), 0); \
+})
+
#endif /* _LINUX_INSTRUMENTED_H */
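A hedged sketch of how the split copy_from_user hooks are meant to be called, mirroring an arch uaccess wrapper (the function below is illustrative, not a kernel API): the "before" hook covers the KASAN/KCSAN write checks, and the "after" hook lets KMSAN unpoison only the bytes that were actually copied.

static unsigned long
example_copy_from_user(void *to, const void __user *from, unsigned long n)
{
	unsigned long left;

	instrument_copy_from_user_before(to, from, n);
	left = raw_copy_from_user(to, from, n);	/* returns bytes NOT copied */
	instrument_copy_from_user_after(to, from, n, left);
	return left;
}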
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index b092277bf48d..d811b3d7d2a1 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -98,19 +98,13 @@ static inline bool kasan_has_integrated_init(void)
#ifdef CONFIG_KASAN
struct kasan_cache {
+#ifdef CONFIG_KASAN_GENERIC
int alloc_meta_offset;
int free_meta_offset;
+#endif
bool is_kmalloc;
};
-slab_flags_t __kasan_never_merge(void);
-static __always_inline slab_flags_t kasan_never_merge(void)
-{
- if (kasan_enabled())
- return __kasan_never_merge();
- return 0;
-}
-
void __kasan_unpoison_range(const void *addr, size_t size);
static __always_inline void kasan_unpoison_range(const void *addr, size_t size)
{
@@ -134,15 +128,6 @@ static __always_inline void kasan_unpoison_pages(struct page *page,
__kasan_unpoison_pages(page, order, init);
}
-void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
- slab_flags_t *flags);
-static __always_inline void kasan_cache_create(struct kmem_cache *cache,
- unsigned int *size, slab_flags_t *flags)
-{
- if (kasan_enabled())
- __kasan_cache_create(cache, size, flags);
-}
-
void __kasan_cache_create_kmalloc(struct kmem_cache *cache);
static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache)
{
@@ -150,14 +135,6 @@ static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache)
__kasan_cache_create_kmalloc(cache);
}
-size_t __kasan_metadata_size(struct kmem_cache *cache);
-static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache)
-{
- if (kasan_enabled())
- return __kasan_metadata_size(cache);
- return 0;
-}
-
void __kasan_poison_slab(struct slab *slab);
static __always_inline void kasan_poison_slab(struct slab *slab)
{
@@ -269,20 +246,12 @@ static __always_inline bool kasan_check_byte(const void *addr)
#else /* CONFIG_KASAN */
-static inline slab_flags_t kasan_never_merge(void)
-{
- return 0;
-}
static inline void kasan_unpoison_range(const void *address, size_t size) {}
static inline void kasan_poison_pages(struct page *page, unsigned int order,
bool init) {}
static inline void kasan_unpoison_pages(struct page *page, unsigned int order,
bool init) {}
-static inline void kasan_cache_create(struct kmem_cache *cache,
- unsigned int *size,
- slab_flags_t *flags) {}
static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {}
-static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
static inline void kasan_poison_slab(struct slab *slab) {}
static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
void *object) {}
@@ -333,6 +302,11 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
#ifdef CONFIG_KASAN_GENERIC
+size_t kasan_metadata_size(struct kmem_cache *cache);
+slab_flags_t kasan_never_merge(void);
+void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
+ slab_flags_t *flags);
+
void kasan_cache_shrink(struct kmem_cache *cache);
void kasan_cache_shutdown(struct kmem_cache *cache);
void kasan_record_aux_stack(void *ptr);
@@ -340,6 +314,21 @@ void kasan_record_aux_stack_noalloc(void *ptr);
#else /* CONFIG_KASAN_GENERIC */
+/* Tag-based KASAN modes do not use per-object metadata. */
+static inline size_t kasan_metadata_size(struct kmem_cache *cache)
+{
+ return 0;
+}
+/* And thus nothing prevents cache merging. */
+static inline slab_flags_t kasan_never_merge(void)
+{
+ return 0;
+}
+/* And no cache-related metadata initialization is required. */
+static inline void kasan_cache_create(struct kmem_cache *cache,
+ unsigned int *size,
+ slab_flags_t *flags) {}
+
static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
static inline void kasan_record_aux_stack(void *ptr) {}
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 384f034ae947..70162d707caf 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -16,11 +16,13 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma,
unsigned long vm_flags);
extern void khugepaged_min_free_kbytes_update(void);
#ifdef CONFIG_SHMEM
-extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr);
+extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
+ bool install_pmd);
#else
-static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr)
+static inline int collapse_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr, bool install_pmd)
{
+ return 0;
}
#endif
@@ -46,9 +48,10 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
unsigned long vm_flags)
{
}
-static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr)
+static inline int collapse_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr, bool install_pmd)
{
+ return 0;
}
static inline void khugepaged_min_free_kbytes_update(void)
diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h
new file mode 100644
index 000000000000..c4cae333deec
--- /dev/null
+++ b/include/linux/kmsan-checks.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KMSAN checks to be used for one-off annotations in subsystems.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#ifndef _LINUX_KMSAN_CHECKS_H
+#define _LINUX_KMSAN_CHECKS_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_KMSAN
+
+/**
+ * kmsan_poison_memory() - Mark the memory range as uninitialized.
+ * @address: address to start with.
+ * @size: size of buffer to poison.
+ * @flags: GFP flags for allocations done by this function.
+ *
+ * Until other data is written to this range, KMSAN will treat it as
+ * uninitialized. Error reports for this memory will reference the call site of
+ * kmsan_poison_memory() as origin.
+ */
+void kmsan_poison_memory(const void *address, size_t size, gfp_t flags);
+
+/**
+ * kmsan_unpoison_memory() - Mark the memory range as initialized.
+ * @address: address to start with.
+ * @size: size of buffer to unpoison.
+ *
+ * Until other data is written to this range, KMSAN will treat it as
+ * initialized.
+ */
+void kmsan_unpoison_memory(const void *address, size_t size);
+
+/**
+ * kmsan_check_memory() - Check the memory range for being initialized.
+ * @address: address to start with.
+ * @size: size of buffer to check.
+ *
+ * If any piece of the given range is marked as uninitialized, KMSAN will report
+ * an error.
+ */
+void kmsan_check_memory(const void *address, size_t size);
+
+/**
+ * kmsan_copy_to_user() - Notify KMSAN about a data transfer to userspace.
+ * @to: destination address in the userspace.
+ * @from: source address in the kernel.
+ * @to_copy: number of bytes to copy.
+ * @left: number of bytes not copied.
+ *
+ * If this is a real userspace data transfer, KMSAN checks the bytes that were
+ * actually copied to ensure there was no information leak. If @to belongs to
+ * the kernel space (which is possible for compat syscalls), KMSAN just copies
+ * the metadata.
+ */
+void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
+ size_t left);
+
+#else
+
+static inline void kmsan_poison_memory(const void *address, size_t size,
+ gfp_t flags)
+{
+}
+static inline void kmsan_unpoison_memory(const void *address, size_t size)
+{
+}
+static inline void kmsan_check_memory(const void *address, size_t size)
+{
+}
+static inline void kmsan_copy_to_user(void __user *to, const void *from,
+ size_t to_copy, size_t left)
+{
+}
+
+#endif
+
+#endif /* _LINUX_KMSAN_CHECKS_H */
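A hedged sketch of the one-off annotations this header provides (the function, buffer and sizes are made up): unpoison data produced by something KMSAN cannot see, and explicitly check data before it leaves the kernel.

static void example_kmsan_annotations(void *buf, size_t len)
{
	/* Hardware or asm filled @buf behind KMSAN's back: mark it initialized. */
	kmsan_unpoison_memory(buf, len);

	/* Before handing @buf to a device, insist every byte is initialized. */
	kmsan_check_memory(buf, len);
}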
diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h
new file mode 100644
index 000000000000..e38ae3c34618
--- /dev/null
+++ b/include/linux/kmsan.h
@@ -0,0 +1,330 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KMSAN API for subsystems.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+#ifndef _LINUX_KMSAN_H
+#define _LINUX_KMSAN_H
+
+#include <linux/dma-direction.h>
+#include <linux/gfp.h>
+#include <linux/kmsan-checks.h>
+#include <linux/types.h>
+
+struct page;
+struct kmem_cache;
+struct task_struct;
+struct scatterlist;
+struct urb;
+
+#ifdef CONFIG_KMSAN
+
+/**
+ * kmsan_task_create() - Initialize KMSAN state for the task.
+ * @task: task to initialize.
+ */
+void kmsan_task_create(struct task_struct *task);
+
+/**
+ * kmsan_task_exit() - Notify KMSAN that a task has exited.
+ * @task: task about to finish.
+ */
+void kmsan_task_exit(struct task_struct *task);
+
+/**
+ * kmsan_init_shadow() - Initialize KMSAN shadow at boot time.
+ *
+ * Allocate and initialize KMSAN metadata for early allocations.
+ */
+void __init kmsan_init_shadow(void);
+
+/**
+ * kmsan_init_runtime() - Initialize KMSAN state and enable KMSAN.
+ */
+void __init kmsan_init_runtime(void);
+
+/**
+ * kmsan_memblock_free_pages() - handle freeing of memblock pages.
+ * @page: struct page to free.
+ * @order: order of @page.
+ *
+ * Freed pages are either returned to buddy allocator or held back to be used
+ * as metadata pages.
+ */
+bool __init kmsan_memblock_free_pages(struct page *page, unsigned int order);
+
+/**
+ * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call.
+ * @page: struct page pointer returned by alloc_pages().
+ * @order: order of allocated struct page.
+ * @flags: GFP flags used by alloc_pages()
+ *
+ * KMSAN marks 1<<@order pages starting at @page as uninitialized, unless
+ * @flags contain __GFP_ZERO.
+ */
+void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags);
+
+/**
+ * kmsan_free_page() - Notify KMSAN about a free_pages() call.
+ * @page: struct page pointer passed to free_pages().
+ * @order: order of deallocated struct page.
+ *
+ * KMSAN marks freed memory as uninitialized.
+ */
+void kmsan_free_page(struct page *page, unsigned int order);
+
+/**
+ * kmsan_copy_page_meta() - Copy KMSAN metadata between two pages.
+ * @dst: destination page.
+ * @src: source page.
+ *
+ * KMSAN copies the contents of metadata pages for @src into the metadata pages
+ * for @dst. If @dst has no associated metadata pages, nothing happens.
+ * If @src has no associated metadata pages, @dst metadata pages are unpoisoned.
+ */
+void kmsan_copy_page_meta(struct page *dst, struct page *src);
+
+/**
+ * kmsan_slab_alloc() - Notify KMSAN about a slab allocation.
+ * @s: slab cache the object belongs to.
+ * @object: object pointer.
+ * @flags: GFP flags passed to the allocator.
+ *
+ * Depending on cache flags and GFP flags, KMSAN sets up the metadata of the
+ * newly created object, marking it as initialized or uninitialized.
+ */
+void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags);
+
+/**
+ * kmsan_slab_free() - Notify KMSAN about a slab deallocation.
+ * @s: slab cache the object belongs to.
+ * @object: object pointer.
+ *
+ * KMSAN marks the freed object as uninitialized.
+ */
+void kmsan_slab_free(struct kmem_cache *s, void *object);
+
+/**
+ * kmsan_kmalloc_large() - Notify KMSAN about a large slab allocation.
+ * @ptr: object pointer.
+ * @size: object size.
+ * @flags: GFP flags passed to the allocator.
+ *
+ * Similar to kmsan_slab_alloc(), but for large allocations.
+ */
+void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags);
+
+/**
+ * kmsan_kfree_large() - Notify KMSAN about a large slab deallocation.
+ * @ptr: object pointer.
+ *
+ * Similar to kmsan_slab_free(), but for large allocations.
+ */
+void kmsan_kfree_large(const void *ptr);
+
+/**
+ * kmsan_map_kernel_range_noflush() - Notify KMSAN about a vmap.
+ * @start: start of vmapped range.
+ * @end: end of vmapped range.
+ * @prot: page protection flags used for vmap.
+ * @pages: array of pages.
+ * @page_shift: page_shift passed to vmap_range_noflush().
+ *
+ * KMSAN maps shadow and origin pages of @pages into contiguous ranges in
+ * vmalloc metadata address range.
+ */
+void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
+ pgprot_t prot, struct page **pages,
+ unsigned int page_shift);
+
+/**
+ * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap.
+ * @start: start of vunmapped range.
+ * @end: end of vunmapped range.
+ *
+ * KMSAN unmaps the contiguous metadata ranges created by
+ * kmsan_map_kernel_range_noflush().
+ */
+void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end);
+
+/**
+ * kmsan_ioremap_page_range() - Notify KMSAN about a ioremap_page_range() call.
+ * @addr: range start.
+ * @end: range end.
+ * @phys_addr: physical range start.
+ * @prot: page protection flags used for ioremap_page_range().
+ * @page_shift: page_shift argument passed to vmap_range_noflush().
+ *
+ * KMSAN creates new metadata pages for the physical pages mapped into the
+ * virtual memory.
+ */
+void kmsan_ioremap_page_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int page_shift);
+
+/**
+ * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call.
+ * @start: range start.
+ * @end: range end.
+ *
+ * KMSAN unmaps the metadata pages for the given range and, unlike for
+ * vunmap_page_range(), also deallocates them.
+ */
+void kmsan_iounmap_page_range(unsigned long start, unsigned long end);
+
+/**
+ * kmsan_handle_dma() - Handle a DMA data transfer.
+ * @page: first page of the buffer.
+ * @offset: offset of the buffer within the first page.
+ * @size: buffer size.
+ * @dir: one of possible dma_data_direction values.
+ *
+ * Depending on @direction, KMSAN:
+ * * checks the buffer, if it is copied to device;
+ * * initializes the buffer, if it is copied from device;
+ * * does both, if this is a DMA_BIDIRECTIONAL transfer.
+ */
+void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
+ enum dma_data_direction dir);
+
+/**
+ * kmsan_handle_dma_sg() - Handle a DMA transfer using scatterlist.
+ * @sg: scatterlist holding DMA buffers.
+ * @nents: number of scatterlist entries.
+ * @dir: one of possible dma_data_direction values.
+ *
+ * Depending on @direction, KMSAN:
+ * * checks the buffers in the scatterlist, if they are copied to device;
+ * * initializes the buffers, if they are copied from device;
+ * * does both, if this is a DMA_BIDIRECTIONAL transfer.
+ */
+void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
+ enum dma_data_direction dir);
+
+/**
+ * kmsan_handle_urb() - Handle a USB data transfer.
+ * @urb: struct urb pointer.
+ * @is_out: data transfer direction (true means output to hardware).
+ *
+ * If @is_out is true, KMSAN checks the transfer buffer of @urb. Otherwise,
+ * KMSAN initializes the transfer buffer.
+ */
+void kmsan_handle_urb(const struct urb *urb, bool is_out);
+
+/**
+ * kmsan_unpoison_entry_regs() - Handle pt_regs in low-level entry code.
+ * @regs: struct pt_regs pointer received from assembly code.
+ *
+ * KMSAN unpoisons the contents of the passed pt_regs, preventing potential
+ * false positive reports. Unlike kmsan_unpoison_memory(),
+ * kmsan_unpoison_entry_regs() can be called from the regions where
+ * kmsan_in_runtime() returns true, which is the case in early entry code.
+ */
+void kmsan_unpoison_entry_regs(const struct pt_regs *regs);
+
+#else
+
+static inline void kmsan_init_shadow(void)
+{
+}
+
+static inline void kmsan_init_runtime(void)
+{
+}
+
+static inline bool kmsan_memblock_free_pages(struct page *page,
+ unsigned int order)
+{
+ return true;
+}
+
+static inline void kmsan_task_create(struct task_struct *task)
+{
+}
+
+static inline void kmsan_task_exit(struct task_struct *task)
+{
+}
+
+static inline int kmsan_alloc_page(struct page *page, unsigned int order,
+ gfp_t flags)
+{
+ return 0;
+}
+
+static inline void kmsan_free_page(struct page *page, unsigned int order)
+{
+}
+
+static inline void kmsan_copy_page_meta(struct page *dst, struct page *src)
+{
+}
+
+static inline void kmsan_slab_alloc(struct kmem_cache *s, void *object,
+ gfp_t flags)
+{
+}
+
+static inline void kmsan_slab_free(struct kmem_cache *s, void *object)
+{
+}
+
+static inline void kmsan_kmalloc_large(const void *ptr, size_t size,
+ gfp_t flags)
+{
+}
+
+static inline void kmsan_kfree_large(const void *ptr)
+{
+}
+
+static inline void kmsan_vmap_pages_range_noflush(unsigned long start,
+ unsigned long end,
+ pgprot_t prot,
+ struct page **pages,
+ unsigned int page_shift)
+{
+}
+
+static inline void kmsan_vunmap_range_noflush(unsigned long start,
+ unsigned long end)
+{
+}
+
+static inline void kmsan_ioremap_page_range(unsigned long start,
+ unsigned long end,
+ phys_addr_t phys_addr,
+ pgprot_t prot,
+ unsigned int page_shift)
+{
+}
+
+static inline void kmsan_iounmap_page_range(unsigned long start,
+ unsigned long end)
+{
+}
+
+static inline void kmsan_handle_dma(struct page *page, size_t offset,
+ size_t size, enum dma_data_direction dir)
+{
+}
+
+static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
+ enum dma_data_direction dir)
+{
+}
+
+static inline void kmsan_handle_urb(const struct urb *urb, bool is_out)
+{
+}
+
+static inline void kmsan_unpoison_entry_regs(const struct pt_regs *regs)
+{
+}
+
+#endif
+
+#endif /* _LINUX_KMSAN_H */
diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h
new file mode 100644
index 000000000000..8bfa6c98176d
--- /dev/null
+++ b/include/linux/kmsan_types.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A minimal header declaring types added by KMSAN to existing kernel structs.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+#ifndef _LINUX_KMSAN_TYPES_H
+#define _LINUX_KMSAN_TYPES_H
+
+/* These constants are defined in the MSan LLVM instrumentation pass. */
+#define KMSAN_RETVAL_SIZE 800
+#define KMSAN_PARAM_SIZE 800
+
+struct kmsan_context_state {
+ char param_tls[KMSAN_PARAM_SIZE];
+ char retval_tls[KMSAN_RETVAL_SIZE];
+ char va_arg_tls[KMSAN_PARAM_SIZE];
+ char va_arg_origin_tls[KMSAN_PARAM_SIZE];
+ u64 va_arg_overflow_size_tls;
+ char param_origin_tls[KMSAN_PARAM_SIZE];
+ u32 retval_origin_tls;
+};
+
+#undef KMSAN_PARAM_SIZE
+#undef KMSAN_RETVAL_SIZE
+
+struct kmsan_ctx {
+ struct kmsan_context_state cstate;
+ int kmsan_in_runtime;
+ bool allow_reporting;
+};
+
+#endif /* _LINUX_KMSAN_TYPES_H */
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 0b4f17418f64..7e232ba59b86 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -15,9 +15,6 @@
#include <linux/sched.h>
#include <linux/sched/coredump.h>
-struct stable_node;
-struct mem_cgroup;
-
#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags);
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
new file mode 100644
index 000000000000..2effab72add1
--- /dev/null
+++ b/include/linux/maple_tree.h
@@ -0,0 +1,685 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef _LINUX_MAPLE_TREE_H
+#define _LINUX_MAPLE_TREE_H
+/*
+ * Maple Tree - An RCU-safe adaptive tree for storing ranges
+ * Copyright (c) 2018-2022 Oracle
+ * Authors: Liam R. Howlett <Liam.Howlett@Oracle.com>
+ * Matthew Wilcox <willy@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+/* #define CONFIG_MAPLE_RCU_DISABLED */
+/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */
+
+/*
+ * Allocated nodes are mutable until they have been inserted into the tree,
+ * at which time they cannot change their type until they have been removed
+ * from the tree and an RCU grace period has passed.
+ *
+ * Removed nodes have their ->parent set to point to themselves. RCU readers
+ * check ->parent before relying on the value that they loaded from the
+ * slots array. This lets us reuse the slots array for the RCU head.
+ *
+ * Nodes in the tree point to their parent unless bit 0 is set.
+ */
+#if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64)
+/* 64bit sizes */
+#define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */
+#define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */
+#define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */
+#define MAPLE_ARANGE64_META_MAX 15 /* Out of range for metadata */
+#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1)
+#else
+/* 32bit sizes */
+#define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */
+#define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */
+#define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */
+#define MAPLE_ARANGE64_META_MAX 31 /* Out of range for metadata */
+#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2)
+#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */
+
+#define MAPLE_NODE_MASK 255UL
+
+/*
+ * The node->parent of the root node has bit 0 set and the rest of the pointer
+ * is a pointer to the tree itself. No more bits are available in this pointer
+ * (on m68k, the data structure may only be 2-byte aligned).
+ *
+ * Internal non-root nodes can only have maple_range_* nodes as parents. The
+ * parent pointer is 256B aligned like all other tree nodes. When storing 32 or
+ * 64 bit values, the offset can fit into 4 bits. The 16 bit values need an
+ * extra bit to store the offset. This extra bit comes from a reuse of the last
+ * bit in the node type. This is possible by using bit 1 to indicate if bit 2
+ * is part of the type or the slot.
+ *
+ * Once the type is decided, whether the node is an allocation range type or a
+ * plain range type is determined by checking the tree's immutable
+ * MT_FLAGS_ALLOC_RANGE flag.
+ *
+ * Node types:
+ * 0x??1 = Root
+ * 0x?00 = 16 bit nodes
+ * 0x010 = 32 bit nodes
+ * 0x110 = 64 bit nodes
+ *
+ * Slot size and location in the parent pointer:
+ * type : slot location
+ * 0x??1 : Root
+ * 0x?00 : 16 bit values, type in 0-1, slot in 2-6
+ * 0x010 : 32 bit values, type in 0-2, slot in 3-6
+ * 0x110 : 64 bit values, type in 0-2, slot in 3-6
+ */
+
+/*
+ * This metadata is used to optimize the gap updating code and in reverse
+ * searching for gaps or any other code that needs to find the end of the data.
+ */
+struct maple_metadata {
+ unsigned char end;
+ unsigned char gap;
+};
+
+/*
+ * Leaf nodes do not store pointers to nodes, they store user data. Users may
+ * store almost any bit pattern. As noted above, the optimisation of storing an
+ * entry at 0 in the root pointer cannot be done for data which have the bottom
+ * two bits set to '10'. We also reserve values with the bottom two bits set to
+ * '10' which are below 4096 (i.e. 2, 6, 10 .. 4094) for internal use. Some APIs
+ * return errnos as a negative errno shifted left by two bits with the bottom
+ * two bits set to '10', and while choosing to store these values in the array
+ * is not an error, it may lead to confusion if you're testing for an error with
+ * mas_is_err().
+ *
+ * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits
+ * 3-6); bit 2 is reserved. That leaves bits 0-1 unused for now.
+ *
+ * In regular B-Tree terms, pivots are called keys. The term pivot is used to
+ * indicate that the tree is specifying ranges; pivots may appear in the
+ * subtree with an entry attached to the value, whereas keys are unique to a
+ * specific position of a B-tree. Pivot values are inclusive of the slot with
+ * the same index.
+ */
+
+struct maple_range_64 {
+ struct maple_pnode *parent;
+ unsigned long pivot[MAPLE_RANGE64_SLOTS - 1];
+ union {
+ void __rcu *slot[MAPLE_RANGE64_SLOTS];
+ struct {
+ void __rcu *pad[MAPLE_RANGE64_SLOTS - 1];
+ struct maple_metadata meta;
+ };
+ };
+};
+
+/*
+ * At tree creation time, the user can specify that they're willing to trade off
+ * storing fewer entries in a tree in return for storing more information in
+ * each node.
+ *
+ * The maple tree supports recording the largest range of NULL entries available
+ * in this node, also called gaps. This optimises the tree for allocating a
+ * range.
+ */
+struct maple_arange_64 {
+ struct maple_pnode *parent;
+ unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1];
+ void __rcu *slot[MAPLE_ARANGE64_SLOTS];
+ unsigned long gap[MAPLE_ARANGE64_SLOTS];
+ struct maple_metadata meta;
+};
+
+struct maple_alloc {
+ unsigned long total;
+ unsigned char node_count;
+ unsigned int request_count;
+ struct maple_alloc *slot[MAPLE_ALLOC_SLOTS];
+};
+
+struct maple_topiary {
+ struct maple_pnode *parent;
+ struct maple_enode *next; /* Overlaps the pivot */
+};
+
+enum maple_type {
+ maple_dense,
+ maple_leaf_64,
+ maple_range_64,
+ maple_arange_64,
+};
+
+
+/**
+ * DOC: Maple tree flags
+ *
+ * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree
+ * * MT_FLAGS_USE_RCU - Operate in RCU mode
+ * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags
+ * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value
+ * * MT_FLAGS_LOCK_MASK - How the mt_lock is used
+ * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe
+ * * MT_FLAGS_LOCK_BH - Acquired bh-safe
+ * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used
+ *
+ * MAPLE_HEIGHT_MAX The largest height that can be stored
+ */
+#define MT_FLAGS_ALLOC_RANGE 0x01
+#define MT_FLAGS_USE_RCU 0x02
+#define MT_FLAGS_HEIGHT_OFFSET 0x02
+#define MT_FLAGS_HEIGHT_MASK 0x7C
+#define MT_FLAGS_LOCK_MASK 0x300
+#define MT_FLAGS_LOCK_IRQ 0x100
+#define MT_FLAGS_LOCK_BH 0x200
+#define MT_FLAGS_LOCK_EXTERN 0x300
+
+#define MAPLE_HEIGHT_MAX 31
+
+
+#define MAPLE_NODE_TYPE_MASK 0x0F
+#define MAPLE_NODE_TYPE_SHIFT 0x03
+
+#define MAPLE_RESERVED_RANGE 4096
+
+#ifdef CONFIG_LOCKDEP
+typedef struct lockdep_map *lockdep_map_p;
+#define mt_lock_is_held(mt) lock_is_held(mt->ma_external_lock)
+#define mt_set_external_lock(mt, lock) \
+ (mt)->ma_external_lock = &(lock)->dep_map
+#else
+typedef struct { /* nothing */ } lockdep_map_p;
+#define mt_lock_is_held(mt) 1
+#define mt_set_external_lock(mt, lock) do { } while (0)
+#endif
+
+/*
+ * If the tree contains a single entry at index 0, it is usually stored in
+ * tree->ma_root. To optimise for the page cache, an entry which ends in '00',
+ * '01' or '11' is stored in the root, but an entry which ends in '10' will be
+ * stored in a node. Bits 3-6 are used to store enum maple_type.
+ *
+ * The flags are used both to store some immutable information about this tree
+ * (set at tree creation time) and dynamic information set under the spinlock.
+ *
+ * Another use of the flags is to indicate global states of the tree. This is
+ * the case with the MT_FLAGS_USE_RCU flag, which indicates the tree is
+ * currently in RCU mode. This mode was added to allow the tree to reuse nodes
+ * instead of re-allocating and RCU freeing nodes when there is a single user.
+ */
+struct maple_tree {
+ union {
+ spinlock_t ma_lock;
+ lockdep_map_p ma_external_lock;
+ };
+ void __rcu *ma_root;
+ unsigned int ma_flags;
+};
+
+/**
+ * MTREE_INIT() - Initialize a maple tree
+ * @name: The maple tree name
+ * @__flags: The maple tree flags
+ *
+ */
+#define MTREE_INIT(name, __flags) { \
+ .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \
+ .ma_flags = __flags, \
+ .ma_root = NULL, \
+}
+
+/**
+ * MTREE_INIT_EXT() - Initialize a maple tree with an external lock.
+ * @name: The tree name
+ * @__flags: The maple tree flags
+ * @__lock: The external lock
+ */
+#ifdef CONFIG_LOCKDEP
+#define MTREE_INIT_EXT(name, __flags, __lock) { \
+ .ma_external_lock = &(__lock).dep_map, \
+ .ma_flags = (__flags), \
+ .ma_root = NULL, \
+}
+#else
+#define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags)
+#endif
+
+#define DEFINE_MTREE(name) \
+ struct maple_tree name = MTREE_INIT(name, 0)
+
+#define mtree_lock(mt) spin_lock((&(mt)->ma_lock))
+#define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock))
+
+/*
+ * The Maple Tree squeezes various bits in at various points which aren't
+ * necessarily obvious. Usually, this is done by observing that pointers are
+ * N-byte aligned and thus the bottom log_2(N) bits are available for use. We
+ * don't use the high bits of pointers to store additional information because
+ * we don't know what bits are unused on any given architecture.
+ *
+ * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8
+ * low bits for our own purposes. Nodes are currently of 4 types:
+ * 1. Single pointer (Range is 0-0)
+ * 2. Non-leaf Allocation Range nodes
+ * 3. Non-leaf Range nodes
+ * 4. Leaf Range nodes
+ *
+ * All nodes consist of a number of node slots, pivots, and a parent pointer.
+ */
+
+struct maple_node {
+ union {
+ struct {
+ struct maple_pnode *parent;
+ void __rcu *slot[MAPLE_NODE_SLOTS];
+ };
+ struct {
+ void *pad;
+ struct rcu_head rcu;
+ struct maple_enode *piv_parent;
+ unsigned char parent_slot;
+ enum maple_type type;
+ unsigned char slot_len;
+ unsigned int ma_flags;
+ };
+ struct maple_range_64 mr64;
+ struct maple_arange_64 ma64;
+ struct maple_alloc alloc;
+ };
+};
+
+/*
+ * More complicated stores can cause two nodes to become one or three and
+ * potentially alter the height of the tree. Either half of the tree may need
+ * to be rebalanced against the other. The ma_topiary struct is used to track
+ * which nodes have been 'cut' from the tree so that the change can be done
+ * safely at a later date. This is done to support RCU.
+ */
+struct ma_topiary {
+ struct maple_enode *head;
+ struct maple_enode *tail;
+ struct maple_tree *mtree;
+};
+
+void *mtree_load(struct maple_tree *mt, unsigned long index);
+
+int mtree_insert(struct maple_tree *mt, unsigned long index,
+ void *entry, gfp_t gfp);
+int mtree_insert_range(struct maple_tree *mt, unsigned long first,
+ unsigned long last, void *entry, gfp_t gfp);
+int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
+ void *entry, unsigned long size, unsigned long min,
+ unsigned long max, gfp_t gfp);
+int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp,
+ void *entry, unsigned long size, unsigned long min,
+ unsigned long max, gfp_t gfp);
+
+int mtree_store_range(struct maple_tree *mt, unsigned long first,
+ unsigned long last, void *entry, gfp_t gfp);
+int mtree_store(struct maple_tree *mt, unsigned long index,
+ void *entry, gfp_t gfp);
+void *mtree_erase(struct maple_tree *mt, unsigned long index);
+
+void mtree_destroy(struct maple_tree *mt);
+void __mt_destroy(struct maple_tree *mt);
+
+/**
+ * mtree_empty() - Determine if a tree has any present entries.
+ * @mt: Maple Tree.
+ *
+ * Context: Any context.
+ * Return: %true if the tree contains only NULL pointers.
+ */
+static inline bool mtree_empty(const struct maple_tree *mt)
+{
+ return mt->ma_root == NULL;
+}
+
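/*
 * Editorial sketch, not part of the patch above: a minimal illustration of the
 * normal API declared in this header. example_basic_store(), example_tree and
 * @item are made-up names; the usual kernel headers (gfp.h, errno.h) are
 * assumed to be included.
 */
static DEFINE_MTREE(example_tree);

static int example_basic_store(void *item)
{
	void *entry;
	int ret;

	/* Store @item at index 5, then across the inclusive range [10, 20]. */
	ret = mtree_store(&example_tree, 5, item, GFP_KERNEL);
	if (ret)
		return ret;
	ret = mtree_store_range(&example_tree, 10, 20, item, GFP_KERNEL);
	if (ret)
		return ret;

	entry = mtree_load(&example_tree, 15);	/* returns @item */
	if (entry != item)
		return -EINVAL;

	entry = mtree_erase(&example_tree, 5);	/* removes and returns @item */
	return entry == item ? 0 : -EINVAL;
}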
+/* Advanced API */
+
+/*
+ * The maple state is defined in the struct ma_state and is used to keep track
+ * of information during operations, and even between operations when using the
+ * advanced API.
+ *
+ * If state->node has bit 0 set then it references a tree location which is not
+ * a node (eg the root). If bit 1 is set, the rest of the bits are a negative
+ * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the
+ * node type.
+ *
+ * state->alloc either has a requested number of nodes or an allocated node. If
+ * state->alloc has a requested number of nodes, the first bit will be set (0x1)
+ * and the remaining bits are the value. If state->alloc is a node, then the
+ * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for
+ * storing more allocated nodes, a total number of nodes allocated, and the
+ * node_count in this node. node_count is the number of allocated nodes in this
+ * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further
+ * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc
+ * by removing a node from the state->alloc node until state->alloc->node_count
+ * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted
+ * to state->alloc. Nodes are pushed onto state->alloc by putting the current
+ * state->alloc into the pushed node's slot[0].
+ *
+ * The state also contains the implied min/max of the state->node, the depth of
+ * this search, and the offset. The implied min/max are either from the parent
+ * node or are 0-oo for the root node. The depth is incremented or decremented
+ * every time a node is walked down or up. The offset is the slot/pivot of
+ * interest in the node - either for reading or writing.
+ *
+ * When returning a value the maple state index and last respectively contain
+ * the start and end of the range for the entry. Ranges are inclusive in the
+ * Maple Tree.
+ */
+struct ma_state {
+ struct maple_tree *tree; /* The tree we're operating in */
+ unsigned long index; /* The index we're operating on - range start */
+ unsigned long last; /* The last index we're operating on - range end */
+ struct maple_enode *node; /* The node containing this entry */
+ unsigned long min; /* The minimum index of this node - implied pivot min */
+ unsigned long max; /* The maximum index of this node - implied pivot max */
+ struct maple_alloc *alloc; /* Allocated nodes for this operation */
+ unsigned char depth; /* depth of tree descent during write */
+ unsigned char offset;
+ unsigned char mas_flags;
+};
+
+struct ma_wr_state {
+ struct ma_state *mas;
+ struct maple_node *node; /* Decoded mas->node */
+ unsigned long r_min; /* range min */
+ unsigned long r_max; /* range max */
+ enum maple_type type; /* mas->node type */
+ unsigned char offset_end; /* The offset where the write ends */
+ unsigned char node_end; /* mas->node end */
+ unsigned long *pivots; /* mas->node->pivots pointer */
+ unsigned long end_piv; /* The pivot at the offset end */
+ void __rcu **slots; /* mas->node->slots pointer */
+ void *entry; /* The entry to write */
+ void *content; /* The existing entry that is being overwritten */
+};
+
+#define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock))
+#define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock))
+
+
+/*
+ * Special values for ma_state.node.
+ * MAS_START means we have not searched the tree.
+ * MAS_ROOT means we have searched the tree and the entry we found lives in
+ * the root of the tree (ie it has index 0, length 1 and is the only entry in
+ * the tree).
+ * MAS_NONE means we have searched the tree and there is no node in the
+ * tree for this entry. For example, we searched for index 1 in an empty
+ * tree. Or we have a tree which points to a full leaf node and we
+ * searched for an entry which is larger than can be contained in that
+ * leaf node.
+ * MA_ERROR represents an errno. After dropping the lock and attempting
+ * to resolve the error, the walk would have to be restarted from the
+ * top of the tree as the tree may have been modified.
+ */
+#define MAS_START ((struct maple_enode *)1UL)
+#define MAS_ROOT ((struct maple_enode *)5UL)
+#define MAS_NONE ((struct maple_enode *)9UL)
+#define MAS_PAUSE ((struct maple_enode *)17UL)
+#define MA_ERROR(err) \
+ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL))
+
+#define MA_STATE(name, mt, first, end) \
+ struct ma_state name = { \
+ .tree = mt, \
+ .index = first, \
+ .last = end, \
+ .node = MAS_START, \
+ .min = 0, \
+ .max = ULONG_MAX, \
+ .alloc = NULL, \
+ }
+
+#define MA_WR_STATE(name, ma_state, wr_entry) \
+ struct ma_wr_state name = { \
+ .mas = ma_state, \
+ .content = NULL, \
+ .entry = wr_entry, \
+ }
+
+#define MA_TOPIARY(name, tree) \
+ struct ma_topiary name = { \
+ .head = NULL, \
+ .tail = NULL, \
+ .mtree = tree, \
+ }
+
+void *mas_walk(struct ma_state *mas);
+void *mas_store(struct ma_state *mas, void *entry);
+void *mas_erase(struct ma_state *mas);
+int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp);
+void mas_store_prealloc(struct ma_state *mas, void *entry);
+void *mas_find(struct ma_state *mas, unsigned long max);
+void *mas_find_rev(struct ma_state *mas, unsigned long min);
+int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
+bool mas_is_err(struct ma_state *mas);
+
+bool mas_nomem(struct ma_state *mas, gfp_t gfp);
+void mas_pause(struct ma_state *mas);
+void maple_tree_init(void);
+void mas_destroy(struct ma_state *mas);
+int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries);
+
+void *mas_prev(struct ma_state *mas, unsigned long min);
+void *mas_next(struct ma_state *mas, unsigned long max);
+
+int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max,
+ unsigned long size);
+
+/* Checks if a mas has not found anything */
+static inline bool mas_is_none(struct ma_state *mas)
+{
+ return mas->node == MAS_NONE;
+}
+
+/* Checks if a mas has been paused */
+static inline bool mas_is_paused(struct ma_state *mas)
+{
+ return mas->node == MAS_PAUSE;
+}
+
+void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas);
+void mas_dup_store(struct ma_state *mas, void *entry);
+
+/*
+ * This finds an empty area from the highest address to the lowest.
+ * AKA the "topdown" version.
+ */
+int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
+ unsigned long max, unsigned long size);
+/**
+ * mas_reset() - Reset a Maple Tree operation state.
+ * @mas: Maple Tree operation state.
+ *
+ * Resets the error or walk state of the @mas so future walks of the
+ * array will start from the root. Use this if you have dropped the
+ * lock and want to reuse the ma_state.
+ *
+ * Context: Any context.
+ */
+static inline void mas_reset(struct ma_state *mas)
+{
+ mas->node = MAS_START;
+}
+
+/**
+ * mas_for_each() - Iterate over a range of the maple tree.
+ * @__mas: Maple Tree operation state (maple_state)
+ * @__entry: Entry retrieved from the tree
+ * @__max: maximum index to retrieve from the tree
+ *
+ * When returned, mas->index and mas->last will hold the entire range for the
+ * entry.
+ *
+ * Note: may return the zero entry.
+ *
+ */
+#define mas_for_each(__mas, __entry, __max) \
+ while (((__entry) = mas_find((__mas), (__max))) != NULL)
+
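/*
 * Editorial sketch, not part of the patch above: iterating with the advanced
 * API. example_mas_walk() is a made-up name; taking the internal spinlock via
 * mas_lock() is one locking option, RCU being the other.
 */
static void example_mas_walk(struct maple_tree *mt, unsigned long max)
{
	MA_STATE(mas, mt, 0, 0);
	void *entry;

	mas_lock(&mas);
	mas_for_each(&mas, entry, max) {
		/* mas.index and mas.last hold the entry's inclusive range. */
		pr_info("entry %p spans [%lu, %lu]\n",
			entry, mas.index, mas.last);
	}
	mas_unlock(&mas);
}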
+
+/**
+ * mas_set_range() - Set up Maple Tree operation state for a different index.
+ * @mas: Maple Tree operation state.
+ * @start: New start of range in the Maple Tree.
+ * @last: New end of range in the Maple Tree.
+ *
+ * Move the operation state to refer to a different range. This will
+ * have the effect of starting a walk from the top; see mas_next()
+ * to move to an adjacent index.
+ */
+static inline
+void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last)
+{
+ mas->index = start;
+ mas->last = last;
+ mas->node = MAS_START;
+}
+
+/**
+ * mas_set() - Set up Maple Tree operation state for a different index.
+ * @mas: Maple Tree operation state.
+ * @index: New index into the Maple Tree.
+ *
+ * Move the operation state to refer to a different index. This will
+ * have the effect of starting a walk from the top; see mas_next()
+ * to move to an adjacent index.
+ */
+static inline void mas_set(struct ma_state *mas, unsigned long index)
+{
+
+ mas_set_range(mas, index, index);
+}
+
+static inline bool mt_external_lock(const struct maple_tree *mt)
+{
+ return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN;
+}
+
+/**
+ * mt_init_flags() - Initialise an empty maple tree with flags.
+ * @mt: Maple Tree
+ * @flags: maple tree flags.
+ *
+ * If you need to initialise a Maple Tree with special flags (eg, an
+ * allocation tree), use this function.
+ *
+ * Context: Any context.
+ */
+static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags)
+{
+ mt->ma_flags = flags;
+ if (!mt_external_lock(mt))
+ spin_lock_init(&mt->ma_lock);
+ rcu_assign_pointer(mt->ma_root, NULL);
+}
+
+/**
+ * mt_init() - Initialise an empty maple tree.
+ * @mt: Maple Tree
+ *
+ * An empty Maple Tree.
+ *
+ * Context: Any context.
+ */
+static inline void mt_init(struct maple_tree *mt)
+{
+ mt_init_flags(mt, 0);
+}
+
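/*
 * Editorial sketch, not part of the patch above: a tree initialised with
 * MT_FLAGS_ALLOC_RANGE tracks gaps and can be asked for a free range.
 * example_alloc_range() is a made-up name, and the exact min/max semantics of
 * mtree_alloc_range() are assumed rather than taken from this header.
 */
static int example_alloc_range(void *item)
{
	struct maple_tree tree;
	unsigned long start;
	int ret;

	mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);

	/* Ask for a gap of 0x1000 indices somewhere in [0, 0xffff]. */
	ret = mtree_alloc_range(&tree, &start, item, 0x1000, 0, 0xffff,
				GFP_KERNEL);
	if (!ret)
		pr_info("allocated range starting at %lu\n", start);

	mtree_destroy(&tree);
	return ret;
}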
+static inline bool mt_in_rcu(struct maple_tree *mt)
+{
+#ifdef CONFIG_MAPLE_RCU_DISABLED
+ return false;
+#endif
+ return mt->ma_flags & MT_FLAGS_USE_RCU;
+}
+
+/**
+ * mt_clear_in_rcu() - Switch the tree to non-RCU mode.
+ * @mt: The Maple Tree
+ */
+static inline void mt_clear_in_rcu(struct maple_tree *mt)
+{
+ if (!mt_in_rcu(mt))
+ return;
+
+ if (mt_external_lock(mt)) {
+ BUG_ON(!mt_lock_is_held(mt));
+ mt->ma_flags &= ~MT_FLAGS_USE_RCU;
+ } else {
+ mtree_lock(mt);
+ mt->ma_flags &= ~MT_FLAGS_USE_RCU;
+ mtree_unlock(mt);
+ }
+}
+
+/**
+ * mt_set_in_rcu() - Switch the tree to RCU safe mode.
+ * @mt: The Maple Tree
+ */
+static inline void mt_set_in_rcu(struct maple_tree *mt)
+{
+ if (mt_in_rcu(mt))
+ return;
+
+ if (mt_external_lock(mt)) {
+ BUG_ON(!mt_lock_is_held(mt));
+ mt->ma_flags |= MT_FLAGS_USE_RCU;
+ } else {
+ mtree_lock(mt);
+ mt->ma_flags |= MT_FLAGS_USE_RCU;
+ mtree_unlock(mt);
+ }
+}
+
+void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max);
+void *mt_find_after(struct maple_tree *mt, unsigned long *index,
+ unsigned long max);
+void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min);
+void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);
+
+/**
+ * mt_for_each - Iterate over each entry starting at index until max.
+ * @__tree: The Maple Tree
+ * @__entry: The current entry
+ * @__index: The index to update to track the location in the tree
+ * @__max: The maximum limit for @index
+ *
+ * Note: Will not return the zero entry.
+ */
+#define mt_for_each(__tree, __entry, __index, __max) \
+ for (__entry = mt_find(__tree, &(__index), __max); \
+ __entry; __entry = mt_find_after(__tree, &(__index), __max))
+
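/*
 * Editorial sketch, not part of the patch above: the normal API iterator keeps
 * its position in a caller-provided index rather than in a ma_state.
 * example_mt_walk() is a made-up name; the external rcu_read_lock() assumes
 * the stored entries are themselves RCU-freed.
 */
static void example_mt_walk(struct maple_tree *mt)
{
	unsigned long index = 0;
	void *entry;

	rcu_read_lock();
	mt_for_each(mt, entry, index, ULONG_MAX)
		pr_info("index %lu -> entry %p\n", index, entry);
	rcu_read_unlock();
}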
+
+#ifdef CONFIG_DEBUG_MAPLE_TREE
+extern atomic_t maple_tree_tests_run;
+extern atomic_t maple_tree_tests_passed;
+
+void mt_dump(const struct maple_tree *mt);
+void mt_validate(struct maple_tree *mt);
+#define MT_BUG_ON(__tree, __x) do { \
+ atomic_inc(&maple_tree_tests_run); \
+ if (__x) { \
+ pr_info("BUG at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mt_dump(__tree); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+} while (0)
+#else
+#define MT_BUG_ON(__tree, __x) BUG_ON(__x)
+#endif /* CONFIG_DEBUG_MAPLE_TREE */
+
+#endif /*_LINUX_MAPLE_TREE_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 567f12323f55..e1644a24009c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -80,29 +80,8 @@ enum mem_cgroup_events_target {
MEM_CGROUP_NTARGETS,
};
-struct memcg_vmstats_percpu {
- /* Local (CPU and cgroup) page state & events */
- long state[MEMCG_NR_STAT];
- unsigned long events[NR_VM_EVENT_ITEMS];
-
- /* Delta calculation for lockless upward propagation */
- long state_prev[MEMCG_NR_STAT];
- unsigned long events_prev[NR_VM_EVENT_ITEMS];
-
- /* Cgroup1: threshold notifications & softlimit tree updates */
- unsigned long nr_page_events;
- unsigned long targets[MEM_CGROUP_NTARGETS];
-};
-
-struct memcg_vmstats {
- /* Aggregated (CPU and subtree) page state & events */
- long state[MEMCG_NR_STAT];
- unsigned long events[NR_VM_EVENT_ITEMS];
-
- /* Pending child counts during tree propagation */
- long state_pending[MEMCG_NR_STAT];
- unsigned long events_pending[NR_VM_EVENT_ITEMS];
-};
+struct memcg_vmstats_percpu;
+struct memcg_vmstats;
struct mem_cgroup_reclaim_iter {
struct mem_cgroup *position;
@@ -185,15 +164,6 @@ struct mem_cgroup_thresholds {
struct mem_cgroup_threshold_ary *spare;
};
-#if defined(CONFIG_SMP)
-struct memcg_padding {
- char x[0];
-} ____cacheline_internodealigned_in_smp;
-#define MEMCG_PADDING(name) struct memcg_padding name
-#else
-#define MEMCG_PADDING(name)
-#endif
-
/*
* Remember four most recent foreign writebacks with dirty pages in this
* cgroup. Inode sharing is expected to be uncommon and, even if we miss
@@ -304,10 +274,10 @@ struct mem_cgroup {
spinlock_t move_lock;
unsigned long move_lock_flags;
- MEMCG_PADDING(_pad1_);
+ CACHELINE_PADDING(_pad1_);
/* memory.stat */
- struct memcg_vmstats vmstats;
+ struct memcg_vmstats *vmstats;
/* memory.events */
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
@@ -326,7 +296,7 @@ struct mem_cgroup {
struct list_head objcg_list;
#endif
- MEMCG_PADDING(_pad2_);
+ CACHELINE_PADDING(_pad2_);
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
@@ -350,14 +320,20 @@ struct mem_cgroup {
struct deferred_split deferred_split_queue;
#endif
+#ifdef CONFIG_LRU_GEN
+ /* per-memcg mm_struct list */
+ struct lru_gen_mm_list mm_list;
+#endif
+
struct mem_cgroup_per_node *nodeinfo[];
};
/*
- * size of first charge trial. "32" comes from vmscan.c's magic value.
- * TODO: maybe necessary to use big numbers in big irons.
+ * size of first charge trial.
+ * TODO: it may be necessary to use bigger numbers on big iron, or to make this
+ * dynamic based on the workload.
*/
-#define MEMCG_CHARGE_BATCH 32U
+#define MEMCG_CHARGE_BATCH 64U
extern struct mem_cgroup *root_mem_cgroup;
@@ -444,6 +420,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem folio a caller should hold an rcu read lock to protect memcg
* associated with a kmem folio from being released.
@@ -505,6 +482,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem page a caller should hold an rcu read lock to protect memcg
* associated with a kmem page from being released.
@@ -689,7 +667,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
return __mem_cgroup_charge(folio, mm, gfp);
}
-int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry);
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
@@ -959,6 +937,23 @@ void unlock_page_memcg(struct page *page);
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
+/* try to stabilize folio_memcg() for all the pages in a memcg */
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ rcu_read_lock();
+
+ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
+ return true;
+
+ rcu_read_unlock();
+ return false;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
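/*
 * Editorial sketch, not part of the patch above: the pairing these helpers
 * appear to expect. example_stable_memcg_walk() is a made-up name; the caller
 * bails out if pages are being moved between cgroups, otherwise folio_memcg()
 * stays stable until mem_cgroup_unlock_pages() drops the RCU read lock.
 */
static void example_stable_memcg_walk(struct mem_cgroup *memcg)
{
	if (!mem_cgroup_trylock_pages(memcg))
		return;	/* a task is moving pages out of @memcg; retry later */

	/* ... folio_memcg() is stable for pages charged to @memcg ... */

	mem_cgroup_unlock_pages();
}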
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
int idx, int val)
@@ -985,15 +980,7 @@ static inline void mod_memcg_page_state(struct page *page,
rcu_read_unlock();
}
-static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
-{
- long x = READ_ONCE(memcg->vmstats.state[idx]);
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- return x;
-}
+unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
enum node_stat_item idx)
@@ -1238,7 +1225,7 @@ static inline int mem_cgroup_charge(struct folio *folio,
return 0;
}
-static inline int mem_cgroup_swapin_charge_page(struct page *page,
+static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
{
return 0;
@@ -1433,6 +1420,18 @@ static inline void folio_memcg_unlock(struct folio *folio)
{
}
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ /* to match folio_memcg_rcu() */
+ rcu_read_lock();
+ return true;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
static inline void mem_cgroup_handle_over_high(void)
{
}
@@ -1779,7 +1778,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
{
struct mem_cgroup *memcg;
- if (mem_cgroup_kmem_disabled())
+ if (!memcg_kmem_enabled())
return;
rcu_read_lock();
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
new file mode 100644
index 000000000000..965009aa01d7
--- /dev/null
+++ b/include/linux/memory-tiers.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMORY_TIERS_H
+#define _LINUX_MEMORY_TIERS_H
+
+#include <linux/types.h>
+#include <linux/nodemask.h>
+#include <linux/kref.h>
+#include <linux/mmzone.h>
+/*
+ * Each tier covers an abstract distance chunk of size 128
+ */
+#define MEMTIER_CHUNK_BITS 7
+#define MEMTIER_CHUNK_SIZE (1 << MEMTIER_CHUNK_BITS)
+/*
+ * Smaller abstract distance values imply faster (higher) memory tiers. Offset
+ * the DRAM adistance so that we can accommodate devices with a slightly lower
+ * adistance value (slightly faster) than default DRAM adistance to be part of
+ * the same memory tier.
+ */
+#define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1))
+#define MEMTIER_HOTPLUG_PRIO 100
+
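/*
 * Editorial note, not part of the patch above: an illustrative reading of the
 * constants, assuming a tier is keyed by adistance rounded down to a
 * MEMTIER_CHUNK_SIZE boundary:
 *
 *   MEMTIER_ADISTANCE_DRAM = 4 * 128 + 64 = 576  ->  chunk 576 >> 7 = 4
 *
 * Chunk 4 spans adistance [512, 639], so a device registering an adistance of,
 * say, 520 (slightly faster than default DRAM) still lands in the same tier as
 * DRAM, which is what the offset described above is meant to allow.
 */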
+struct memory_tier;
+struct memory_dev_type {
+ /* list of memory types that are part of the same tier as this type */
+ struct list_head tier_sibiling;
+ /* abstract distance for this specific memory type */
+ int adistance;
+ /* Nodes of same abstract distance */
+ nodemask_t nodes;
+ struct kref kref;
+};
+
+#ifdef CONFIG_NUMA
+extern bool numa_demotion_enabled;
+struct memory_dev_type *alloc_memory_type(int adistance);
+void destroy_memory_type(struct memory_dev_type *memtype);
+void init_node_memory_type(int node, struct memory_dev_type *default_type);
+void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+#ifdef CONFIG_MIGRATION
+int next_demotion_node(int node);
+void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
+bool node_is_toptier(int node);
+#else
+static inline int next_demotion_node(int node)
+{
+ return NUMA_NO_NODE;
+}
+
+static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+ *targets = NODE_MASK_NONE;
+}
+
+static inline bool node_is_toptier(int node)
+{
+ return true;
+}
+#endif
+
+#else
+
+#define numa_demotion_enabled false
+/*
+ * The CONFIG_NUMA implementation reports errors with a non-NULL value;
+ * only these !CONFIG_NUMA stubs return NULL.
+ */
+static inline struct memory_dev_type *alloc_memory_type(int adistance)
+{
+ return NULL;
+}
+
+static inline void destroy_memory_type(struct memory_dev_type *memtype)
+{
+
+}
+
+static inline void init_node_memory_type(int node, struct memory_dev_type *default_type)
+{
+
+}
+
+static inline void clear_node_memory_type(int node, struct memory_dev_type *memtype)
+{
+
+}
+
+static inline int next_demotion_node(int node)
+{
+ return NUMA_NO_NODE;
+}
+
+static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+ *targets = NODE_MASK_NONE;
+}
+
+static inline bool node_is_toptier(int node)
+{
+ return true;
+}
+#endif /* CONFIG_NUMA */
+#endif /* _LINUX_MEMORY_TIERS_H */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index e0b2209ab71c..9fcbf5706595 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -11,7 +11,6 @@ struct page;
struct zone;
struct pglist_data;
struct mem_section;
-struct memory_block;
struct memory_group;
struct resource;
struct vmem_altmap;
@@ -44,11 +43,6 @@ extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
({ \
memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \
})
-/*
- * This definition is just for error path in node hotadd.
- * For node hotremove, we have to replace this.
- */
-#define generic_free_nodedata(pgdat) kfree(pgdat)
extern pg_data_t *node_data[];
static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
@@ -64,9 +58,6 @@ static inline pg_data_t *generic_alloc_nodedata(int nid)
BUG();
return NULL;
}
-static inline void generic_free_nodedata(pg_data_t *pgdat)
-{
-}
static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
{
}
@@ -216,6 +207,22 @@ void put_online_mems(void);
void mem_hotplug_begin(void);
void mem_hotplug_done(void);
+/* See kswapd_is_running() */
+static inline void pgdat_kswapd_lock(pg_data_t *pgdat)
+{
+ mutex_lock(&pgdat->kswapd_lock);
+}
+
+static inline void pgdat_kswapd_unlock(pg_data_t *pgdat)
+{
+ mutex_unlock(&pgdat->kswapd_lock);
+}
+
+static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat)
+{
+ mutex_init(&pgdat->kswapd_lock);
+}
+
#else /* ! CONFIG_MEMORY_HOTPLUG */
#define pfn_to_online_page(pfn) \
({ \
@@ -252,6 +259,10 @@ static inline bool movable_node_is_enabled(void)
{
return false;
}
+
+static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {}
+static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {}
+static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {}
#endif /* ! CONFIG_MEMORY_HOTPLUG */
/*
@@ -333,7 +344,6 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
extern void remove_pfn_range_from_zone(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages);
-extern bool is_memblock_offlined(struct memory_block *mem);
extern int sparse_add_section(int nid, unsigned long pfn,
unsigned long nr_pages, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 668389b4b53d..d232de7cdc56 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -151,13 +151,6 @@ extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask);
extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
-static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
-{
- struct mempolicy *mpol = get_task_policy(current);
-
- return policy_nodemask(gfp, mpol);
-}
-
extern unsigned int mempolicy_slab_node(void);
extern enum zone_type policy_zone;
@@ -189,6 +182,7 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol)
return (pol->mode == MPOL_PREFERRED_MANY);
}
+extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
#else
@@ -294,11 +288,6 @@ static inline void mpol_put_task_policy(struct task_struct *task)
{
}
-static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
-{
- return NULL;
-}
-
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
return false;
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 22c0a0cf5e0c..704a04f5a074 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -100,21 +100,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
#endif /* CONFIG_MIGRATION */
-#if defined(CONFIG_MIGRATION) && defined(CONFIG_NUMA)
-extern void set_migration_target_nodes(void);
-extern void migrate_on_reclaim_init(void);
-extern bool numa_demotion_enabled;
-extern int next_demotion_node(int node);
-#else
-static inline void set_migration_target_nodes(void) {}
-static inline void migrate_on_reclaim_init(void) {}
-static inline int next_demotion_node(int node)
-{
- return NUMA_NO_NODE;
-}
-#define numa_demotion_enabled false
-#endif
-
#ifdef CONFIG_COMPACTION
bool PageMovable(struct page *page);
void __SetPageMovable(struct page *page, const struct movable_operations *ops);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21f8b27bd9fd..8bbcccbc5565 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -661,6 +661,38 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
return vma->vm_flags & VM_ACCESS_FLAGS;
}
+static inline
+struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
+{
+ return mas_find(&vmi->mas, max);
+}
+
+static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
+{
+ /*
+ * Uses vma_find() to get the first VMA when the iterator starts.
+ * Calling mas_next() could skip the first entry.
+ */
+ return vma_find(vmi, ULONG_MAX);
+}
+
+static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
+{
+ return mas_prev(&vmi->mas, 0);
+}
+
+static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
+{
+ return vmi->mas.index;
+}
+
+#define for_each_vma(__vmi, __vma) \
+ while (((__vma) = vma_next(&(__vmi))) != NULL)
+
+/* The MM code likes to work with exclusive end addresses */
+#define for_each_vma_range(__vmi, __vma, __end) \
+ while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL)
+
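/*
 * Editorial sketch, not part of the patch above: walking every VMA with the
 * maple tree based iterator. example_dump_vmas() is a made-up name, and the
 * VMA_ITERATOR() initializer is assumed from the vma_iterator definition
 * introduced alongside this API.
 */
static void example_dump_vmas(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	for_each_vma(vmi, vma)
		pr_info("vma [%#lx, %#lx)\n", vma->vm_start, vma->vm_end);
	mmap_read_unlock(mm);
}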
#ifdef CONFIG_SHMEM
/*
* The vma_is_shmem is not inline because it is used only by slow
@@ -697,7 +729,9 @@ static inline unsigned int compound_order(struct page *page)
*/
static inline unsigned int folio_order(struct folio *folio)
{
- return compound_order(&folio->page);
+ if (!folio_test_large(folio))
+ return 0;
+ return folio->_folio_order;
}
#include <linux/huge_mm.h>
@@ -1255,6 +1289,18 @@ static inline int folio_nid(const struct folio *folio)
}
#ifdef CONFIG_NUMA_BALANCING
+/* page access time bits need to hold at least 4 seconds */
+#define PAGE_ACCESS_TIME_MIN_BITS 12
+#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS
+#define PAGE_ACCESS_TIME_BUCKETS \
+ (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT)
+#else
+#define PAGE_ACCESS_TIME_BUCKETS 0
+#endif
+
+#define PAGE_ACCESS_TIME_MASK \
+ (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS)
+
static inline int cpu_pid_to_cpupid(int cpu, int pid)
{
return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
@@ -1318,12 +1364,25 @@ static inline void page_cpupid_reset_last(struct page *page)
page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
+
+static inline int xchg_page_access_time(struct page *page, int time)
+{
+ int last_time;
+
+ last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
+ return last_time << PAGE_ACCESS_TIME_BUCKETS;
+}
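/*
 * Editorial note, not part of the patch above: when LAST_CPUPID_SHIFT is
 * smaller than 12, the time value is stored with its low
 * PAGE_ACCESS_TIME_BUCKETS bits dropped, so the value returned here is the
 * previously stored time rounded down to a multiple of
 * (1 << PAGE_ACCESS_TIME_BUCKETS). For example (an assumed configuration),
 * LAST_CPUPID_SHIFT = 8 gives PAGE_ACCESS_TIME_BUCKETS = 4, i.e. a
 * granularity of 16 time units.
 */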
#else /* !CONFIG_NUMA_BALANCING */
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
return page_to_nid(page); /* XXX */
}
+static inline int xchg_page_access_time(struct page *page, int time)
+{
+ return 0;
+}
+
static inline int page_cpupid_last(struct page *page)
{
return page_to_nid(page); /* XXX */
@@ -1465,6 +1524,11 @@ static inline unsigned long folio_pfn(struct folio *folio)
return page_to_pfn(&folio->page);
}
+static inline struct folio *pfn_folio(unsigned long pfn)
+{
+ return page_folio(pfn_to_page(pfn));
+}
+
static inline atomic_t *folio_pincount_ptr(struct folio *folio)
{
return &folio_page(folio, 1)->compound_pincount;
@@ -1597,7 +1661,13 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
*/
static inline long folio_nr_pages(struct folio *folio)
{
- return compound_nr(&folio->page);
+ if (!folio_test_large(folio))
+ return 1;
+#ifdef CONFIG_64BIT
+ return folio->_folio_nr_pages;
+#else
+ return 1L << folio->_folio_order;
+#endif
}
/**
@@ -1776,7 +1846,11 @@ extern void pagefault_out_of_memory(void);
*/
#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
-extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);
+extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
+static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask)
+{
+ __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
+}
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
@@ -1795,8 +1869,9 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
void zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
-void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
- unsigned long start, unsigned long end);
+void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
+ struct vm_area_struct *start_vma, unsigned long start,
+ unsigned long end);
struct mmu_notifier_range;
@@ -2495,7 +2570,6 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn);
-extern unsigned long find_min_pfn_with_active_regions(void);
#ifndef CONFIG_NUMA
static inline int early_pfn_to_nid(unsigned long pfn)
@@ -2516,7 +2590,12 @@ extern void calculate_min_free_kbytes(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
extern void __init mmap_init(void);
-extern void show_mem(unsigned int flags, nodemask_t *nodemask);
+
+extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
+static inline void show_mem(unsigned int flags, nodemask_t *nodemask)
+{
+ __show_mem(flags, nodemask, MAX_NR_ZONES - 1);
+}
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
@@ -2593,14 +2672,15 @@ extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
extern int split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
-extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
- struct rb_node **, struct rb_node *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
unsigned long addr, unsigned long len, pgoff_t pgoff,
bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas);
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas);
+
static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
unsigned long start,
@@ -2648,8 +2728,9 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr,
extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
unsigned long pgoff, unsigned long *populate, struct list_head *uf);
-extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
- struct list_head *uf, bool downgrade);
+extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
+ unsigned long start, size_t len, struct list_head *uf,
+ bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
@@ -2716,26 +2797,12 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
struct vm_area_struct **pprev);
-/**
- * find_vma_intersection() - Look up the first VMA which intersects the interval
- * @mm: The process address space.
- * @start_addr: The inclusive start user address.
- * @end_addr: The exclusive end user address.
- *
- * Returns: The first VMA within the provided range, %NULL otherwise. Assumes
- * start_addr < end_addr.
+/*
+ * Look up the first VMA which intersects the interval [start_addr, end_addr);
+ * NULL if none. Assumes start_addr < end_addr.
*/
-static inline
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
- unsigned long start_addr,
- unsigned long end_addr)
-{
- struct vm_area_struct *vma = find_vma(mm, start_addr);
-
- if (vma && end_addr <= vma->vm_start)
- vma = NULL;
- return vma;
-}
+ unsigned long start_addr, unsigned long end_addr);
/**
* vma_lookup() - Find a VMA at a specific address
@@ -2747,12 +2814,7 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
static inline
struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = find_vma(mm, addr);
-
- if (vma && addr < vma->vm_start)
- vma = NULL;
-
- return vma;
+ return mtree_load(&mm->mm_mt, addr);
}
static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
@@ -2788,7 +2850,7 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
unsigned long vm_start, unsigned long vm_end)
{
- struct vm_area_struct *vma = find_vma(mm, vm_start);
+ struct vm_area_struct *vma = vma_lookup(mm, vm_start);
if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
vma = NULL;
@@ -2888,7 +2950,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
* and return without waiting upon it */
#define FOLL_NOFAULT 0x80 /* do not fault in pages */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
-#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
@@ -2975,8 +3036,8 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
* PageAnonExclusive() has to protect against concurrent GUP:
* * Ordinary GUP: Using the PT lock
* * GUP-fast and fork(): mm->write_protect_seq
- * * GUP-fast and KSM or temporary unmapping (swap, migration):
- * clear/invalidate+flush of the page table entry
+ * * GUP-fast and KSM or temporary unmapping (swap, migration): see
+ * page_try_share_anon_rmap()
*
* Must be called with the (sub)page that's actually referenced via the
* page table entry, which might not necessarily be the head page for a
@@ -2997,6 +3058,11 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page)
*/
if (!PageAnon(page))
return false;
+
+ /* Paired with a memory barrier in page_try_share_anon_rmap(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_rmb();
+
/*
* Note that PageKsm() pages cannot be exclusive, and consequently,
* cannot get pinned.
@@ -3004,6 +3070,21 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page)
return !PageAnonExclusive(page);
}
+/*
+ * Indicates whether GUP can follow a PROT_NONE mapped page, or whether
+ * a (NUMA hinting) fault is required.
+ */
+static inline bool gup_can_follow_protnone(unsigned int flags)
+{
+ /*
+ * FOLL_FORCE has to be able to make progress even if the VMA is
+ * inaccessible. Further, FOLL_FORCE access usually does not represent
+ * application behaviour and we should avoid triggering NUMA hinting
+ * faults.
+ */
+ return flags & FOLL_FORCE;
+}
+
typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
unsigned long size, pte_fn_t fn, void *data);
@@ -3011,7 +3092,7 @@ extern int apply_to_existing_page_range(struct mm_struct *mm,
unsigned long address, unsigned long size,
pte_fn_t fn, void *data);
-extern void init_mem_debugging_and_hardening(void);
+extern void __init init_mem_debugging_and_hardening(void);
#ifdef CONFIG_PAGE_POISONING
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 7b25b53c474a..e8ed225d8f7c 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -34,15 +34,25 @@ static inline int page_is_file_lru(struct page *page)
return folio_is_file_lru(page_folio(page));
}
-static __always_inline void update_lru_size(struct lruvec *lruvec,
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
enum lru_list lru, enum zone_type zid,
long nr_pages)
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ lockdep_assert_held(&lruvec->lru_lock);
+ WARN_ON_ONCE(nr_pages != (int)nr_pages);
+
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+ enum lru_list lru, enum zone_type zid,
+ long nr_pages)
+{
+ __update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
@@ -66,11 +76,6 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio)
__folio_clear_unevictable(folio);
}
-static __always_inline void __clear_page_lru_flags(struct page *page)
-{
- __folio_clear_lru_flags(page_folio(page));
-}
-
/**
* folio_lru_list - Which LRU list should a folio be on?
* @folio: The folio to test.
@@ -94,11 +99,224 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio)
return lru;
}
+#ifdef CONFIG_LRU_GEN
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+static inline bool lru_gen_enabled(void)
+{
+ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
+}
+#else
+static inline bool lru_gen_enabled(void)
+{
+ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
+}
+#endif
+
+static inline bool lru_gen_in_fault(void)
+{
+ return current->in_lru_fault;
+}
+
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+ return seq % MAX_NR_GENS;
+}
+
+static inline int lru_hist_from_seq(unsigned long seq)
+{
+ return seq % NR_HIST_GENS;
+}
+
+static inline int lru_tier_from_refs(int refs)
+{
+ VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
+
+ /* see the comment in folio_lru_refs() */
+ return order_base_2(refs + 1);
+}
+
+static inline int folio_lru_refs(struct folio *folio)
+{
+ unsigned long flags = READ_ONCE(folio->flags);
+ bool workingset = flags & BIT(PG_workingset);
+
+ /*
+ * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
+ * total number of accesses is N>1, since N=0,1 both map to the first
+ * tier. lru_tier_from_refs() will account for this off-by-one. Also see
+ * the comment on MAX_NR_TIERS.
+ */
+ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
+}
+
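/*
 * Editorial note, not part of the patch above: a worked example of the
 * off-by-one, ignoring the PG_workingset bonus. With N total accesses the
 * stored refs value is N - 1 (for N >= 1), so lru_tier_from_refs() yields:
 *
 *   N = 0..1  -> refs 0    -> tier order_base_2(1) = 0
 *   N = 2     -> refs 1    -> tier order_base_2(2) = 1
 *   N = 3..4  -> refs 2..3 -> tier 2
 *   N = 5..8  -> refs 4..7 -> tier 3
 */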
+static inline int folio_lru_gen(struct folio *folio)
+{
+ unsigned long flags = READ_ONCE(folio->flags);
+
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+ unsigned long max_seq = lruvec->lrugen.max_seq;
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+
+ /* see the comment on MIN_NR_GENS */
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
+}
+
+static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
+ int old_gen, int new_gen)
+{
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ int delta = folio_nr_pages(folio);
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
+
+ if (old_gen >= 0)
+ WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
+ lrugen->nr_pages[old_gen][type][zone] - delta);
+ if (new_gen >= 0)
+ WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
+ lrugen->nr_pages[new_gen][type][zone] + delta);
+
+ /* addition */
+ if (old_gen < 0) {
+ if (lru_gen_is_active(lruvec, new_gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, delta);
+ return;
+ }
+
+ /* deletion */
+ if (new_gen < 0) {
+ if (lru_gen_is_active(lruvec, old_gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, -delta);
+ return;
+ }
+
+ /* promotion */
+ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
+ __update_lru_size(lruvec, lru, zone, -delta);
+ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
+ }
+
+ /* demotion requires isolation, e.g., lru_deactivate_fn() */
+ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
+}
+
+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ unsigned long seq;
+ unsigned long flags;
+ int gen = folio_lru_gen(folio);
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
+
+ if (folio_test_unevictable(folio) || !lrugen->enabled)
+ return false;
+ /*
+ * There are three common cases for this page:
+ * 1. If it's hot, e.g., freshly faulted in or previously hot and
+ * migrated, add it to the youngest generation.
+ * 2. If it's cold but can't be evicted immediately, i.e., an anon page
+ * not in swapcache or a dirty page pending writeback, add it to the
+ * second oldest generation.
+ * 3. Everything else (clean, cold) is added to the oldest generation.
+ */
+ if (folio_test_active(folio))
+ seq = lrugen->max_seq;
+ else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
+ (folio_test_reclaim(folio) &&
+ (folio_test_dirty(folio) || folio_test_writeback(folio))))
+ seq = lrugen->min_seq[type] + 1;
+ else
+ seq = lrugen->min_seq[type];
+
+ gen = lru_gen_from_seq(seq);
+ flags = (gen + 1UL) << LRU_GEN_PGOFF;
+ /* see the comment on MIN_NR_GENS about PG_active */
+ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+
+ lru_gen_update_size(lruvec, folio, -1, gen);
+ /* for folio_rotate_reclaimable() */
+ if (reclaiming)
+ list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+ else
+ list_add(&folio->lru, &lrugen->lists[gen][type][zone]);
+
+ return true;
+}
+
+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ unsigned long flags;
+ int gen = folio_lru_gen(folio);
+
+ if (gen < 0)
+ return false;
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+
+ /* for folio_migrate_flags() */
+ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
+ flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
+ gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+ lru_gen_update_size(lruvec, folio, gen, -1);
+ list_del(&folio->lru);
+
+ return true;
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline bool lru_gen_enabled(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_in_fault(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ return false;
+}
+
+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
static __always_inline
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_add_folio(lruvec, folio, false))
+ return;
+
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
if (lru != LRU_UNEVICTABLE)
@@ -116,23 +334,23 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_add_folio(lruvec, folio, true))
+ return;
+
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
/* This is not expected to be used on LRU_UNEVICTABLE */
list_add_tail(&folio->lru, &lruvec->lists[lru]);
}
-static __always_inline void add_page_to_lru_list_tail(struct page *page,
- struct lruvec *lruvec)
-{
- lruvec_add_folio_tail(lruvec, page_folio(page));
-}
-
static __always_inline
void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_del_folio(lruvec, folio, false))
+ return;
+
if (lru != LRU_UNEVICTABLE)
list_del(&folio->lru);
update_lru_size(lruvec, lru, folio_zonenum(folio),
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index cf97f3884fda..500e536796ca 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -9,6 +9,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
+#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
@@ -223,6 +224,18 @@ struct page {
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
+#ifdef CONFIG_KMSAN
+ /*
+ * KMSAN metadata for this page:
+ * - shadow page: every bit indicates whether the corresponding
+ * bit of the original page is initialized (0) or not (1);
+ * - origin page: every 4 bytes contain an id of the stack trace
+ * where the uninitialized value was created.
+ */
+ struct page *kmsan_shadow;
+ struct page *kmsan_origin;
+#endif
+
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
@@ -244,6 +257,13 @@ struct page {
* @_refcount: Do not access this member directly. Use folio_ref_count()
* to find how many references there are to this folio.
* @memcg_data: Memory Control Group data.
+ * @_flags_1: For large folios, additional page flags.
+ * @__head: Points to the folio. Do not use.
+ * @_folio_dtor: Which destructor to use for this folio.
+ * @_folio_order: Do not use directly, call folio_order().
+ * @_total_mapcount: Do not use directly, call folio_entire_mapcount().
+ * @_pincount: Do not use directly, call folio_maybe_dma_pinned().
+ * @_folio_nr_pages: Do not use directly, call folio_nr_pages().
*
* A folio is a physically, virtually and logically contiguous set
* of bytes. It is a power-of-two in size, and it is aligned to that
@@ -282,9 +302,17 @@ struct folio {
};
struct page page;
};
+ unsigned long _flags_1;
+ unsigned long __head;
+ unsigned char _folio_dtor;
+ unsigned char _folio_order;
+ atomic_t _total_mapcount;
+ atomic_t _pincount;
+#ifdef CONFIG_64BIT
+ unsigned int _folio_nr_pages;
+#endif
};
-static_assert(sizeof(struct page) == sizeof(struct folio));
#define FOLIO_MATCH(pg, fl) \
static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl))
FOLIO_MATCH(flags, flags);
@@ -299,6 +327,19 @@ FOLIO_MATCH(_refcount, _refcount);
FOLIO_MATCH(memcg_data, memcg_data);
#endif
#undef FOLIO_MATCH
+#define FOLIO_MATCH(pg, fl) \
+ static_assert(offsetof(struct folio, fl) == \
+ offsetof(struct page, pg) + sizeof(struct page))
+FOLIO_MATCH(flags, _flags_1);
+FOLIO_MATCH(compound_head, __head);
+FOLIO_MATCH(compound_dtor, _folio_dtor);
+FOLIO_MATCH(compound_order, _folio_order);
+FOLIO_MATCH(compound_mapcount, _total_mapcount);
+FOLIO_MATCH(compound_pincount, _pincount);
+#ifdef CONFIG_64BIT
+FOLIO_MATCH(compound_nr, _folio_nr_pages);
+#endif
+#undef FOLIO_MATCH
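The second FOLIO_MATCH block asserts that each new folio field sits exactly sizeof(struct page) bytes into the folio, i.e. that it overlays the corresponding field of page[1] in a compound page. A stand-alone toy of the same offsetof()/static_assert() pattern, with invented struct names purely for illustration:

#include <assert.h>
#include <stddef.h>

struct base { unsigned long flags; void *head; };

struct overlay {
	unsigned long flags_0;	/* overlays base[0].flags */
	void *head_0;		/* overlays base[0].head */
	unsigned long flags_1;	/* intended to overlay base[1].flags */
	void *head_1;		/* intended to overlay base[1].head */
};

static_assert(offsetof(struct overlay, flags_1) ==
	      offsetof(struct base, flags) + sizeof(struct base),
	      "flags_1 must line up with the second struct base");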
static inline atomic_t *folio_mapcount_ptr(struct folio *folio)
{
@@ -407,21 +448,6 @@ struct vm_area_struct {
unsigned long vm_end; /* The first byte after our end address
within vm_mm. */
- /* linked list of VM areas per task, sorted by address */
- struct vm_area_struct *vm_next, *vm_prev;
-
- struct rb_node vm_rb;
-
- /*
- * Largest free memory gap in bytes to the left of this VMA.
- * Either between this VMA and vma->vm_prev, or between one of the
- * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
- * get_unmapped_area find a free area of the right size.
- */
- unsigned long rb_subtree_gap;
-
- /* Second cache line starts here. */
-
struct mm_struct *vm_mm; /* The address space we belong to. */
/*
@@ -485,9 +511,7 @@ struct vm_area_struct {
struct kioctx_table;
struct mm_struct {
struct {
- struct vm_area_struct *mmap; /* list of VMAs */
- struct rb_root mm_rb;
- u64 vmacache_seqnum; /* per-thread vmacache */
+ struct maple_tree mm_mt;
#ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
@@ -501,7 +525,6 @@ struct mm_struct {
unsigned long mmap_compat_legacy_base;
#endif
unsigned long task_size; /* size of task vm space */
- unsigned long highest_vm_end; /* highest vma end address */
pgd_t * pgd;
#ifdef CONFIG_MEMBARRIER
@@ -631,22 +654,22 @@ struct mm_struct {
#endif
#ifdef CONFIG_NUMA_BALANCING
/*
- * numa_next_scan is the next time that the PTEs will be marked
- * pte_numa. NUMA hinting faults will gather statistics and
- * migrate pages to new nodes if necessary.
+ * numa_next_scan is the next time that PTEs will be remapped
+ * PROT_NONE to trigger NUMA hinting faults; such faults gather
+ * statistics and migrate pages to new nodes if necessary.
*/
unsigned long numa_next_scan;
- /* Restart point for scanning and setting pte_numa */
+ /* Restart point for scanning and remapping PTEs. */
unsigned long numa_scan_offset;
- /* numa_scan_seq prevents two threads setting pte_numa */
+ /* numa_scan_seq prevents two threads remapping PTEs. */
int numa_scan_seq;
#endif
/*
* An operation with batched TLB flushing is going on. Anything
* that can move process memory needs to flush the TLB when
- * moving a PROT_NONE or PROT_NUMA mapped page.
+ * moving a PROT_NONE mapped page.
*/
atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
@@ -671,7 +694,28 @@ struct mm_struct {
* merging.
*/
unsigned long ksm_merging_pages;
+ /*
+ * Represent how many pages are checked for ksm merging
+ * including merged and not merged.
+ */
+ unsigned long ksm_rmap_items;
+#endif
+#ifdef CONFIG_LRU_GEN
+ struct {
+ /* this mm_struct is on lru_gen_mm_list */
+ struct list_head list;
+ /*
+ * Set when switching to this mm_struct, as a hint of
+ * whether it has been used since the last time per-node
+ * page table walkers cleared the corresponding bits.
+ */
+ unsigned long bitmap;
+#ifdef CONFIG_MEMCG
+ /* points to the memcg of "owner" above */
+ struct mem_cgroup *memcg;
#endif
+ } lru_gen;
+#endif /* CONFIG_LRU_GEN */
} __randomize_layout;
/*
@@ -681,6 +725,7 @@ struct mm_struct {
unsigned long cpu_bitmap[];
};
+#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
extern struct mm_struct init_mm;
/* Pointer magic because the dynamic array size confuses some compilers. */
@@ -698,6 +743,87 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
return (struct cpumask *)&mm->cpu_bitmap;
}
+#ifdef CONFIG_LRU_GEN
+
+struct lru_gen_mm_list {
+ /* mm_struct list for page table walkers */
+ struct list_head fifo;
+ /* protects the list above */
+ spinlock_t lock;
+};
+
+void lru_gen_add_mm(struct mm_struct *mm);
+void lru_gen_del_mm(struct mm_struct *mm);
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm);
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+ INIT_LIST_HEAD(&mm->lru_gen.list);
+ mm->lru_gen.bitmap = 0;
+#ifdef CONFIG_MEMCG
+ mm->lru_gen.memcg = NULL;
+#endif
+}
+
+static inline void lru_gen_use_mm(struct mm_struct *mm)
+{
+ /*
+ * When the bitmap is set, page reclaim knows this mm_struct has been
+ * used since the last time it cleared the bitmap. So it might be worth
+ * walking the page tables of this mm_struct to clear the accessed bit.
+ */
+ WRITE_ONCE(mm->lru_gen.bitmap, -1);
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_add_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_del_mm(struct mm_struct *mm)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+}
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_use_mm(struct mm_struct *mm)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
+struct vma_iterator {
+ struct ma_state mas;
+};
+
+#define VMA_ITERATOR(name, __mm, __addr) \
+ struct vma_iterator name = { \
+ .mas = { \
+ .tree = &(__mm)->mm_mt, \
+ .index = __addr, \
+ .node = MAS_START, \
+ }, \
+ }
+
+static inline void vma_iter_init(struct vma_iterator *vmi,
+ struct mm_struct *mm, unsigned long addr)
+{
+ vmi->mas.tree = &mm->mm_mt;
+ vmi->mas.index = addr;
+ vmi->mas.node = MAS_START;
+}
+
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
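Since struct vma_iterator above just wraps a maple-tree ma_state, walking every VMA becomes a tree search instead of a vm_next list walk. A hedged sketch of such a walk, using the mas_find() primitive from the Maple Tree API; the kernel also grows a for_each_vma() convenience macro elsewhere in this series, and the function name below is illustrative:

static void example_walk_vmas(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_read_lock(mm);	/* MT_FLAGS_LOCK_EXTERN: the tree relies on mmap_lock */
	while ((vma = mas_find(&vmi.mas, ULONG_MAX)) != NULL) {
		/* ... inspect vma->vm_start, vma->vm_end ... */
	}
	mmap_read_unlock(mm);
}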
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index c1bc6731125c..0bb4b6da9993 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -25,18 +25,6 @@
#define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8)
/*
- * The per task VMA cache array:
- */
-#define VMACACHE_BITS 2
-#define VMACACHE_SIZE (1U << VMACACHE_BITS)
-#define VMACACHE_MASK (VMACACHE_SIZE - 1)
-
-struct vmacache {
- u64 seqnum;
- struct vm_area_struct *vmas[VMACACHE_SIZE];
-};
-
-/*
* When updating this, please also update struct resident_page_types[] in
* kernel/fork.c
*/
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 355d842d2731..5f74891556f3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -24,10 +24,10 @@
#include <asm/page.h>
/* Free memory management - zoned buddy allocator. */
-#ifndef CONFIG_FORCE_MAX_ZONEORDER
+#ifndef CONFIG_ARCH_FORCE_MAX_ORDER
#define MAX_ORDER 11
#else
-#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
+#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
#endif
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
@@ -121,20 +121,6 @@ static inline bool free_area_empty(struct free_area *area, int migratetype)
struct pglist_data;
-/*
- * Add a wild amount of padding here to ensure data fall into separate
- * cachelines. There are very few zone structures in the machine, so space
- * consumption is not a concern here.
- */
-#if defined(CONFIG_SMP)
-struct zone_padding {
- char x[0];
-} ____cacheline_internodealigned_in_smp;
-#define ZONE_PADDING(name) struct zone_padding name;
-#else
-#define ZONE_PADDING(name)
-#endif
-
#ifdef CONFIG_NUMA
enum numa_stat_item {
NUMA_HIT, /* allocated in intended node */
@@ -222,6 +208,7 @@ enum node_stat_item {
#endif
#ifdef CONFIG_NUMA_BALANCING
PGPROMOTE_SUCCESS, /* promote successfully */
+ PGPROMOTE_CANDIDATE, /* candidate pages to promote */
#endif
NR_VM_NODE_STAT_ITEMS
};
@@ -307,6 +294,8 @@ static inline bool is_active_lru(enum lru_list lru)
return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
}
+#define WORKINGSET_ANON 0
+#define WORKINGSET_FILE 1
#define ANON_AND_FILE 2
enum lruvec_flags {
@@ -315,6 +304,207 @@ enum lruvec_flags {
*/
};
+#endif /* !__GENERATING_BOUNDS_H */
+
+/*
+ * Evictable pages are divided into multiple generations. The youngest and the
+ * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
+ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
+ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
+ * corresponding generation. The gen counter in folio->flags stores gen+1 while
+ * a page is on one of lrugen->lists[]. Otherwise it stores 0.
+ *
+ * A page is added to the youngest generation on faulting. The aging needs to
+ * check the accessed bit at least twice before handing this page over to the
+ * eviction. The first check takes care of the accessed bit set on the initial
+ * fault; the second check makes sure this page hasn't been used since then.
+ * This process, AKA second chance, requires a minimum of two generations,
+ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
+ * LRU, e.g., /proc/vmstat, these two generations are considered active; the
+ * rest of generations, if they exist, are considered inactive. See
+ * lru_gen_is_active().
+ *
+ * PG_active is always cleared while a page is on one of lrugen->lists[] so that
+ * the aging need not worry about it. It is set again when a page
+ * considered active is isolated for non-reclaiming purposes, e.g., migration.
+ * See lru_gen_add_folio() and lru_gen_del_folio().
+ *
+ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
+ * number of categories of the active/inactive LRU when keeping track of
+ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
+ * in folio->flags.
+ */
+#define MIN_NR_GENS 2U
+#define MAX_NR_GENS 4U
+
+/*
+ * Each generation is divided into multiple tiers. A page accessed N times
+ * through file descriptors is in tier order_base_2(N). A page in the first tier
+ * (N=0,1) is marked by PG_referenced unless it was faulted in through page
+ * tables or read ahead. A page in any other tier (N>1) is marked by
+ * PG_referenced and PG_workingset. This implies a minimum of two tiers is
+ * supported without using additional bits in folio->flags.
+ *
+ * In contrast to moving across generations which requires the LRU lock, moving
+ * across tiers only involves atomic operations on folio->flags and therefore
+ * has a negligible cost in the buffered access path. In the eviction path,
+ * comparisons of refaulted/(evicted+protected) from the first tier and the
+ * rest infer whether pages accessed multiple times through file descriptors
+ * are statistically hot and thus worth protecting.
+ *
+ * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
+ * number of categories of the active/inactive LRU when keeping track of
+ * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
+ * folio->flags.
+ */
+#define MAX_NR_TIERS 4U
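Concretely, a generation number is the sequence number folded into MAX_NR_GENS slots, and the two youngest generations are the ones reported as active to the legacy active/inactive interfaces. A sketch of the two helpers this comment alludes to; both live in mm_inline.h in this series, and the bodies below are assumptions consistent with the description above:

static inline int lru_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
	unsigned long max_seq = READ_ONCE(lruvec->lrugen.max_seq);

	/* the youngest two generations count as "active"; see MIN_NR_GENS */
	return gen == lru_gen_from_seq(max_seq) ||
	       gen == lru_gen_from_seq(max_seq - 1);
}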
+
+#ifndef __GENERATING_BOUNDS_H
+
+struct lruvec;
+struct page_vma_mapped_walk;
+
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+
+#ifdef CONFIG_LRU_GEN
+
+enum {
+ LRU_GEN_ANON,
+ LRU_GEN_FILE,
+};
+
+enum {
+ LRU_GEN_CORE,
+ LRU_GEN_MM_WALK,
+ LRU_GEN_NONLEAF_YOUNG,
+ NR_LRU_GEN_CAPS
+};
+
+#define MIN_LRU_BATCH BITS_PER_LONG
+#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
+
+/* whether to keep historical stats from evicted generations */
+#ifdef CONFIG_LRU_GEN_STATS
+#define NR_HIST_GENS MAX_NR_GENS
+#else
+#define NR_HIST_GENS 1U
+#endif
+
+/*
+ * The youngest generation number is stored in max_seq for both anon and file
+ * types as they are aged on an equal footing. The oldest generation numbers are
+ * stored in min_seq[] separately for anon and file types as clean file pages
+ * can be evicted regardless of swap constraints.
+ *
+ * Normally anon and file min_seq are in sync. But if swapping is constrained,
+ * e.g., out of swap space, file min_seq is allowed to advance and leave anon
+ * min_seq behind.
+ *
+ * The number of pages in each generation is eventually consistent and therefore
+ * can be transiently negative when reset_batch_size() is pending.
+ */
+struct lru_gen_struct {
+ /* the aging increments the youngest generation number */
+ unsigned long max_seq;
+ /* the eviction increments the oldest generation numbers */
+ unsigned long min_seq[ANON_AND_FILE];
+ /* the birth time of each generation in jiffies */
+ unsigned long timestamps[MAX_NR_GENS];
+ /* the multi-gen LRU lists, lazily sorted on eviction */
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* the multi-gen LRU sizes, eventually consistent */
+ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* the exponential moving average of refaulted */
+ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
+ /* the exponential moving average of evicted+protected */
+ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
+ /* the first tier doesn't need protection, hence the minus one */
+ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
+ /* can be modified without holding the LRU lock */
+ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ /* whether the multi-gen LRU is enabled */
+ bool enabled;
+};
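Putting the pieces together: a folio's slot in this structure is addressed by its generation, its type (anon or file) and its zone. The add path shown earlier in mm_inline.h reduces to roughly the following simplified sketch, not the literal code:

	int gen  = lru_gen_from_seq(seq);	/* 0 .. MAX_NR_GENS-1 */
	int type = folio_is_file_lru(folio);	/* LRU_GEN_ANON or LRU_GEN_FILE */
	int zone = folio_zonenum(folio);

	list_add(&folio->lru, &lrugen->lists[gen][type][zone]);
	/* lrugen->nr_pages[gen][type][zone] is updated separately, eventually consistent */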
+
+enum {
+ MM_LEAF_TOTAL, /* total leaf entries */
+ MM_LEAF_OLD, /* old leaf entries */
+ MM_LEAF_YOUNG, /* young leaf entries */
+ MM_NONLEAF_TOTAL, /* total non-leaf entries */
+ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
+ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
+ NR_MM_STATS
+};
+
+/* double-buffering Bloom filters */
+#define NR_BLOOM_FILTERS 2
+
+struct lru_gen_mm_state {
+ /* set to max_seq after each iteration */
+ unsigned long seq;
+ /* where the current iteration continues (inclusive) */
+ struct list_head *head;
+ /* where the last iteration ended (exclusive) */
+ struct list_head *tail;
+ /* to wait for the last page table walker to finish */
+ struct wait_queue_head wait;
+ /* Bloom filters flip after each iteration */
+ unsigned long *filters[NR_BLOOM_FILTERS];
+ /* the mm stats for debugging */
+ unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
+ /* the number of concurrent page table walkers */
+ int nr_walkers;
+};
+
+struct lru_gen_mm_walk {
+ /* the lruvec under reclaim */
+ struct lruvec *lruvec;
+ /* unstable max_seq from lru_gen_struct */
+ unsigned long max_seq;
+ /* the next address within an mm to scan */
+ unsigned long next_addr;
+ /* to batch promoted pages */
+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* to batch the mm stats */
+ int mm_stats[NR_MM_STATS];
+ /* total batched items */
+ int batched;
+ bool can_swap;
+ bool force_scan;
+};
+
+void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
+void lru_gen_exit_memcg(struct mem_cgroup *memcg);
+#endif
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+}
+
+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+}
+#endif
+
+#endif /* CONFIG_LRU_GEN */
+
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
@@ -332,6 +522,12 @@ struct lruvec {
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
+#ifdef CONFIG_LRU_GEN
+ /* evictable pages divided into generations */
+ struct lru_gen_struct lrugen;
+ /* to concurrently iterate lru_gen_mm_list */
+ struct lru_gen_mm_state mm_state;
+#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
@@ -369,13 +565,6 @@ enum zone_watermarks {
#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
-/*
- * Shift to encode migratetype and order in the same integer, with order
- * in the least significant bits.
- */
-#define NR_PCP_ORDER_WIDTH 8
-#define NR_PCP_ORDER_MASK ((1<<NR_PCP_ORDER_WIDTH) - 1)
-
#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
@@ -628,7 +817,7 @@ struct zone {
int initialized;
/* Write-intensive fields used from the page allocator */
- ZONE_PADDING(_pad1_)
+ CACHELINE_PADDING(_pad1_);
/* free areas of different sizes */
struct free_area free_area[MAX_ORDER];
@@ -640,7 +829,7 @@ struct zone {
spinlock_t lock;
/* Write-intensive fields used by compaction and vmstats. */
- ZONE_PADDING(_pad2_)
+ CACHELINE_PADDING(_pad2_);
/*
* When free pages are below this point, additional steps are taken
@@ -677,7 +866,7 @@ struct zone {
bool contiguous;
- ZONE_PADDING(_pad3_)
+ CACHELINE_PADDING(_pad3_);
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
@@ -747,6 +936,8 @@ static inline bool zone_is_empty(struct zone *zone)
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
/*
* Define the bit shifts to access each section. For non-existent
@@ -954,8 +1145,10 @@ typedef struct pglist_data {
atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
unsigned long nr_reclaim_start; /* nr pages written while throttled
* when throttling started. */
- struct task_struct *kswapd; /* Protected by
- mem_hotplug_begin/done() */
+#ifdef CONFIG_MEMORY_HOTPLUG
+ struct mutex kswapd_lock;
+#endif
+ struct task_struct *kswapd; /* Protected by kswapd_lock */
int kswapd_order;
enum zone_type kswapd_highest_zoneidx;
@@ -983,7 +1176,7 @@ typedef struct pglist_data {
#endif /* CONFIG_NUMA */
/* Write-intensive fields used by page reclaim */
- ZONE_PADDING(_pad1_)
+ CACHELINE_PADDING(_pad1_);
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
@@ -997,6 +1190,21 @@ typedef struct pglist_data {
struct deferred_split deferred_split_queue;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ /* start time in ms of current promote rate limit period */
+ unsigned int nbp_rl_start;
+ /* number of promote candidate pages at start time of current rate limit period */
+ unsigned long nbp_rl_nr_cand;
+ /* promote threshold in ms */
+ unsigned int nbp_threshold;
+ /* start time in ms of current promote threshold adjustment period */
+ unsigned int nbp_th_start;
+ /*
+ * number of promote candidate pages at start time of current promote
+ * threshold adjustment period
+ */
+ unsigned long nbp_th_nr_cand;
+#endif
/* Fields commonly accessed by the page reclaim scanner */
/*
@@ -1008,11 +1216,19 @@ typedef struct pglist_data {
unsigned long flags;
- ZONE_PADDING(_pad2_)
+#ifdef CONFIG_LRU_GEN
+ /* kswap mm walk data */
+ struct lru_gen_mm_walk mm_walk;
+#endif
+
+ CACHELINE_PADDING(_pad2_);
/* Per-node vmstats */
struct per_cpu_nodestat __percpu *per_cpu_nodestats;
atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
+#ifdef CONFIG_NUMA
+ struct memory_tier __rcu *memtier;
+#endif
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -1026,11 +1242,6 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
return pgdat->node_start_pfn + pgdat->node_spanned_pages;
}
-static inline bool pgdat_is_empty(pg_data_t *pgdat)
-{
- return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
-}
-
#include <linux/memory_hotplug.h>
void build_all_zonelists(pg_data_t *pgdat);
diff --git a/include/linux/node.h b/include/linux/node.h
index 40d641a8bfb0..427a5975cf40 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -2,15 +2,15 @@
/*
* include/linux/node.h - generic node definition
*
- * This is mainly for topological representation. We define the
- * basic 'struct node' here, which can be embedded in per-arch
+ * This is mainly for topological representation. We define the
+ * basic 'struct node' here, which can be embedded in per-arch
* definitions of processors.
*
* Basic handling of the devices is done in drivers/base/node.c
- * and system devices are handled in drivers/base/sys.c.
+ * and system devices are handled in drivers/base/sys.c.
*
* Nodes are exported via driverfs in the class/node/devices/
- * directory.
+ * directory.
*/
#ifndef _LINUX_NODE_H_
#define _LINUX_NODE_H_
@@ -18,7 +18,6 @@
#include <linux/device.h>
#include <linux/cpumask.h>
#include <linux/list.h>
-#include <linux/workqueue.h>
/**
* struct node_hmem_attrs - heterogeneous memory performance attributes
@@ -84,10 +83,6 @@ static inline void node_set_perf_attrs(unsigned int nid,
struct node {
struct device dev;
struct list_head access_list;
-
-#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS)
- struct work_struct node_work;
-#endif
#ifdef CONFIG_HMEM_REPORTING
struct list_head cache_attrs;
struct device *cache_dev;
@@ -96,7 +91,6 @@ struct node {
struct memory_block;
extern struct node *node_devices[];
-typedef void (*node_registration_func_t)(struct node *);
#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA)
void register_memory_blocks_under_node(int nid, unsigned long start_pfn,
@@ -144,11 +138,6 @@ extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);
extern int register_memory_node_under_compute_node(unsigned int mem_nid,
unsigned int cpu_nid,
unsigned access);
-
-#ifdef CONFIG_HUGETLBFS
-extern void register_hugetlbfs_with_node(node_registration_func_t doregister,
- node_registration_func_t unregister);
-#endif
#else
static inline void node_dev_init(void)
{
@@ -176,18 +165,8 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
{
}
-
-static inline void register_hugetlbfs_with_node(node_registration_func_t reg,
- node_registration_func_t unreg)
-{
-}
#endif
#define to_node(device) container_of(device, struct node, dev)
-static inline bool node_is_toptier(int node)
-{
- return node_state(node, N_CPU);
-}
-
#endif /* _LINUX_NODE_H_ */
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 0c45fb066caa..378956c93c94 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -493,6 +493,7 @@ static inline int num_node_state(enum node_states state)
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
+#define next_memory_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1U
#define nr_online_nodes 1U
@@ -504,11 +505,20 @@ static inline int num_node_state(enum node_states state)
static inline int node_random(const nodemask_t *maskp)
{
#if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
- int w, bit = NUMA_NO_NODE;
+ int w, bit;
w = nodes_weight(*maskp);
- if (w)
+ switch (w) {
+ case 0:
+ bit = NUMA_NO_NODE;
+ break;
+ case 1:
+ bit = first_node(*maskp);
+ break;
+ default:
bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_int() % w);
+ break;
+ }
return bit;
#else
return 0;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 02d1e7bbd8cd..7d0c9c48a0c5 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -78,15 +78,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
}
/*
- * Use this helper if tsk->mm != mm and the victim mm needs a special
- * handling. This is guaranteed to stay true after once set.
- */
-static inline bool mm_is_oom_victim(struct mm_struct *mm)
-{
- return test_bit(MMF_OOM_VICTIM, &mm->flags);
-}
-
-/*
* Checks whether a page fault on the given mm is still reliable.
* This is no longer true if the oom reaper started to reap the
* address space which is reflected by MMF_UNSTABLE flag set in
@@ -106,8 +97,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
return 0;
}
-bool __oom_reap_task_mm(struct mm_struct *mm);
-
long oom_badness(struct task_struct *p,
unsigned long totalpages);
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index ef1e3e736e14..7d79818dc065 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -55,7 +55,8 @@
#define SECTIONS_WIDTH 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
+ <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
#error "Vmemmap: No space for nodes field in page flags"
@@ -89,8 +90,8 @@
#define LAST_CPUPID_SHIFT 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
- <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
#else
#define LAST_CPUPID_WIDTH 0
@@ -100,10 +101,15 @@
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
- > BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error "Not enough bits in page flags"
#endif
+/* see the comment on MAX_NR_TIERS */
+#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \
+ ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \
+ NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH)
+
#endif
#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 465ff35a8c00..0b0ae5084e60 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1058,7 +1058,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
1UL << PG_private | 1UL << PG_private_2 | \
1UL << PG_writeback | 1UL << PG_reserved | \
1UL << PG_slab | 1UL << PG_active | \
- 1UL << PG_unevictable | __PG_MLOCKED)
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
/*
* Flags checked when a page is prepped for return by the page allocator.
@@ -1069,7 +1069,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
* alloc-free cycle to prevent from reusing the page.
*/
#define PAGE_FLAGS_CHECK_AT_PREP \
- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 679591301994..c141ea9a95ef 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -3,15 +3,17 @@
#define _LINUX_PAGE_COUNTER_H
#include <linux/atomic.h>
+#include <linux/cache.h>
#include <linux/kernel.h>
#include <asm/page.h>
struct page_counter {
+ /*
+ * Make sure 'usage' does not share a cacheline with any other field;
+ * memcg->memory.usage is a hot member of struct mem_cgroup.
+ */
atomic_long_t usage;
- unsigned long min;
- unsigned long low;
- unsigned long high;
- unsigned long max;
+ CACHELINE_PADDING(_pad1_);
/* effective memory.min and memory.min usage tracking */
unsigned long emin;
@@ -23,18 +25,18 @@ struct page_counter {
atomic_long_t low_usage;
atomic_long_t children_low_usage;
- /* legacy */
unsigned long watermark;
unsigned long failcnt;
- /*
- * 'parent' is placed here to be far from 'usage' to reduce
- * cache false sharing, as 'usage' is written mostly while
- * parent is frequently read for cgroup's hierarchical
- * counting nature.
- */
+ /* Keep all the read-mostly fields in a separate cacheline. */
+ CACHELINE_PADDING(_pad2_);
+
+ unsigned long min;
+ unsigned long low;
+ unsigned long high;
+ unsigned long max;
struct page_counter *parent;
-};
+} ____cacheline_internodealigned_in_smp;
#if BITS_PER_LONG == 32
#define PAGE_COUNTER_MAX LONG_MAX
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index fabb2e1e087f..22be4582faae 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -36,9 +36,15 @@ struct page_ext {
unsigned long flags;
};
+extern bool early_page_ext;
extern unsigned long page_ext_size;
extern void pgdat_page_ext_init(struct pglist_data *pgdat);
+static inline bool early_page_ext_enabled(void)
+{
+ return early_page_ext;
+}
+
#ifdef CONFIG_SPARSEMEM
static inline void page_ext_init_flatmem(void)
{
@@ -55,7 +61,8 @@ static inline void page_ext_init(void)
}
#endif
-struct page_ext *lookup_page_ext(const struct page *page);
+extern struct page_ext *page_ext_get(struct page *page);
+extern void page_ext_put(struct page_ext *page_ext);
static inline struct page_ext *page_ext_next(struct page_ext *curr)
{
@@ -67,13 +74,13 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr)
#else /* !CONFIG_PAGE_EXTENSION */
struct page_ext;
-static inline void pgdat_page_ext_init(struct pglist_data *pgdat)
+static inline bool early_page_ext_enabled(void)
{
+ return false;
}
-static inline struct page_ext *lookup_page_ext(const struct page *page)
+static inline void pgdat_page_ext_init(struct pglist_data *pgdat)
{
- return NULL;
}
static inline void page_ext_init(void)
@@ -87,5 +94,14 @@ static inline void page_ext_init_flatmem_late(void)
static inline void page_ext_init_flatmem(void)
{
}
+
+static inline struct page_ext *page_ext_get(struct page *page)
+{
+ return NULL;
+}
+
+static inline void page_ext_put(struct page_ext *page_ext)
+{
+}
#endif /* CONFIG_PAGE_EXTENSION */
#endif /* __LINUX_PAGE_EXT_H */
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 4663dfed1293..5cb7bd2078ec 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -13,65 +13,79 @@
* If there is not enough space to store Idle and Young bits in page flags, use
* page ext flags instead.
*/
-
static inline bool folio_test_young(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
+ bool page_young;
if (unlikely(!page_ext))
return false;
- return test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_young = test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_ext_put(page_ext);
+
+ return page_young;
}
static inline void folio_set_young(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
if (unlikely(!page_ext))
return;
set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_ext_put(page_ext);
}
static inline bool folio_test_clear_young(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
+ bool page_young;
if (unlikely(!page_ext))
return false;
- return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_young = test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
+ page_ext_put(page_ext);
+
+ return page_young;
}
static inline bool folio_test_idle(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
+ bool page_idle;
if (unlikely(!page_ext))
return false;
- return test_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_ext_put(page_ext);
+
+ return page_idle;
}
static inline void folio_set_idle(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
if (unlikely(!page_ext))
return;
set_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_ext_put(page_ext);
}
static inline void folio_clear_idle(struct folio *folio)
{
- struct page_ext *page_ext = lookup_page_ext(&folio->page);
+ struct page_ext *page_ext = page_ext_get(&folio->page);
if (unlikely(!page_ext))
return;
clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
+ page_ext_put(page_ext);
}
#endif /* !CONFIG_64BIT */
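Every accessor above follows the same new discipline: page_ext_get() pins the page_ext entry so it cannot be freed underneath the caller, and page_ext_put() releases it, replacing the bare lookup_page_ext(). A generic sketch of the pattern for any page_ext-backed flag; PAGE_EXT_SOME_FLAG is a placeholder name, not a real flag:

static inline bool page_ext_test_some_flag(struct page *page)
{
	bool ret;
	struct page_ext *page_ext = page_ext_get(page);	/* may return NULL */

	if (unlikely(!page_ext))
		return false;

	ret = test_bit(PAGE_EXT_SOME_FLAG, &page_ext->flags);
	page_ext_put(page_ext);

	return ret;
}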
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 83c7248053a1..5f1ae07d724b 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -53,6 +53,10 @@ extern unsigned int pageblock_order;
#endif /* CONFIG_HUGETLB_PAGE */
#define pageblock_nr_pages (1UL << pageblock_order)
+#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages)
+#define pageblock_aligned(pfn) IS_ALIGNED((pfn), pageblock_nr_pages)
+#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages)
+#define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages)
/* Forward declaration */
struct page;
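The new helpers replace open-coded ALIGN()/IS_ALIGNED() arithmetic on pageblock_nr_pages. A small worked sketch of what each one computes, assuming pageblock_order == 9, i.e. 512-page (2 MiB) pageblocks; the function is illustrative only:

static unsigned long example_pageblock_math(void)
{
	unsigned long first = pageblock_start_pfn(0x12345UL);	/* 0x12200, block start */
	unsigned long next  = pageblock_end_pfn(0x12345UL);	/* 0x12400, exclusive end */
	unsigned long up    = pageblock_align(0x12345UL);	/* 0x12400, rounded up */

	return pageblock_aligned(first) ? next : up;		/* first is aligned: 0x12400 */
}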
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 201dc7281640..bbccb4044222 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -718,8 +718,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index)
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch);
-unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
- unsigned int nr_pages, struct page **pages);
+unsigned filemap_get_folios_contig(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
struct page **pages);
@@ -989,19 +989,16 @@ static inline int lock_page_killable(struct page *page)
}
/*
- * lock_page_or_retry - Lock the page, unless this would block and the
+ * folio_lock_or_retry - Lock the folio, unless this would block and the
* caller indicated that it can handle a retry.
*
* Return value and mmap_lock implications depend on flags; see
* __folio_lock_or_retry().
*/
-static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm,
- unsigned int flags)
+static inline bool folio_lock_or_retry(struct folio *folio,
+ struct mm_struct *mm, unsigned int flags)
{
- struct folio *folio;
might_sleep();
-
- folio = page_folio(page);
return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags);
}
@@ -1042,7 +1039,6 @@ static inline int wait_on_page_locked_killable(struct page *page)
return folio_wait_locked_killable(page_folio(page));
}
-int folio_put_wait_locked(struct folio *folio, int state);
void wait_on_page_writeback(struct page *page);
void folio_wait_writeback(struct folio *folio);
int folio_wait_writeback_killable(struct folio *folio);
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index ac7b38ad5903..f3fafb731ffd 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -15,12 +15,12 @@ struct mm_walk;
* this handler is required to be able to handle
* pmd_trans_huge() pmds. They may simply choose to
* split_huge_page() instead of handling it explicitly.
- * @pte_entry: if set, called for each non-empty PTE (lowest-level)
- * entry
+ * @pte_entry: if set, called for each PTE (lowest-level) entry,
+ * including empty ones
* @pte_hole: if set, called for each hole at all levels,
- * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD
- * 4:PTE. Any folded depths (where PTRS_PER_P?D is equal
- * to 1) are skipped.
+ * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
+ * Any folded depths (where PTRS_PER_P?D is equal to 1)
+ * are skipped.
* @hugetlb_entry: if set, called for each hugetlb entry
* @test_walk: caller specific callback function to determine whether
* we walk over the current vma or not. Returning 0 means
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 014ee8f0fbaa..a108b60a6962 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -213,7 +213,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
#endif
#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
@@ -234,7 +234,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
BUILD_BUG();
return 0;
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif
#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
@@ -260,6 +260,19 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
+#ifndef arch_has_hw_pte_young
+/*
+ * Return whether the accessed bit is supported on the local CPU.
+ *
+ * This stub assumes accessing through an old PTE triggers a page fault.
+ * Architectures that automatically set the accessed bit should override it.
+ */
+static inline bool arch_has_hw_pte_young(void)
+{
+ return false;
+}
+#endif
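An architecture whose MMU maintains the accessed bit in hardware is expected to override this fallback from its asm/pgtable.h. A sketch of what such an override looks like; the x86 change elsewhere in this series is essentially a "return true" along these lines:

/* in an arch's asm/pgtable.h, when the CPU sets the accessed bit itself */
#define arch_has_hw_pte_young arch_has_hw_pte_young
static inline bool arch_has_hw_pte_young(void)
{
	return true;
}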
+
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address,
@@ -1276,8 +1289,7 @@ static inline int pgd_devmap(pgd_t pgd)
#endif
#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
- (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
- !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
+ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline int pud_trans_huge(pud_t pud)
{
return 0;
@@ -1598,11 +1610,7 @@ typedef unsigned int pgtbl_mod_mask;
#endif
#ifndef has_transparent_hugepage
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define has_transparent_hugepage() 1
-#else
-#define has_transparent_hugepage() 0
-#endif
+#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
#endif
/*
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b89b4b86951f..bd3504d11b15 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -166,7 +166,7 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
unlink_anon_vmas(next);
}
-struct anon_vma *page_get_anon_vma(struct page *page);
+struct anon_vma *folio_get_anon_vma(struct folio *folio);
/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;
@@ -270,7 +270,7 @@ dup:
* @page: the exclusive anonymous page to try marking possibly shared
*
* The caller needs to hold the PT lock and has to have the page table entry
- * cleared/invalidated+flushed, to properly sync against GUP-fast.
+ * cleared/invalidated.
*
* This is similar to page_try_dup_anon_rmap(), however, not used during fork()
* to duplicate a mapping, but instead to prepare for KSM or temporarily
@@ -286,12 +286,68 @@ static inline int page_try_share_anon_rmap(struct page *page)
{
VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
- /* See page_try_dup_anon_rmap(). */
- if (likely(!is_device_private_page(page) &&
- unlikely(page_maybe_dma_pinned(page))))
- return -EBUSY;
+ /* device private pages cannot get pinned via GUP. */
+ if (unlikely(is_device_private_page(page))) {
+ ClearPageAnonExclusive(page);
+ return 0;
+ }
+
+ /*
+ * We have to make sure that when we clear PageAnonExclusive, that
+ * the page is not pinned and that concurrent GUP-fast won't succeed in
+ * concurrently pinning the page.
+ *
+ * Conceptually, PageAnonExclusive clearing consists of:
+ * (A1) Clear PTE
+ * (A2) Check if the page is pinned; back off if so.
+ * (A3) Clear PageAnonExclusive
+ * (A4) Restore PTE (optional, but certainly not writable)
+ *
+ * When clearing PageAnonExclusive, we cannot possibly map the page
+ * writable again, because anon pages that may be shared must never
+ * be writable. So in any case, if the PTE was writable it cannot
+ * be writable anymore afterwards and there would be a PTE change. Only
+ * if the PTE wasn't writable, there might not be a PTE change.
+ *
+ * Conceptually, GUP-fast pinning of an anon page consists of:
+ * (B1) Read the PTE
+ * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
+ * (B3) Pin the mapped page
+ * (B4) Check if the PTE changed by re-reading it; back off if so.
+ * (B5) If the original PTE is not writable, check if
+ * PageAnonExclusive is not set; back off if so.
+ *
+ * If the PTE was writable, we only have to make sure that GUP-fast
+ * observes a PTE change and properly backs off.
+ *
+ * If the PTE was not writable, we have to make sure that GUP-fast either
+ * detects a (temporary) PTE change or that PageAnonExclusive is cleared
+ * and properly backs off.
+ *
+ * Consequently, when clearing PageAnonExclusive(), we have to make
+ * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
+ * order. In GUP-fast pinning code, we have to make sure that (B3), (B4)
+ * and (B5) happen in the right memory order.
+ *
+ * We assume that there might not be a memory barrier after
+ * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
+ * so we use explicit ones here.
+ */
+ /* Paired with the memory barrier in try_grab_folio(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb();
+
+ if (unlikely(page_maybe_dma_pinned(page)))
+ return -EBUSY;
ClearPageAnonExclusive(page);
+
+ /*
+ * This is conceptually a smp_wmb() paired with the smp_rmb() in
+ * gup_must_unshare().
+ */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb__after_atomic();
return 0;
}
@@ -405,13 +461,8 @@ struct rmap_walk_control {
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
-
-/*
- * Called by memory-failure.c to kill processes.
- */
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
struct rmap_walk_control *rwc);
-void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
#else /* !CONFIG_MMU */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11a317810566..77f68f8b795c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -14,6 +14,7 @@
#include <linux/pid.h>
#include <linux/sem.h>
#include <linux/shm.h>
+#include <linux/kmsan_types.h>
#include <linux/mutex.h>
#include <linux/plist.h>
#include <linux/hrtimer.h>
@@ -870,7 +871,6 @@ struct task_struct {
struct mm_struct *active_mm;
/* Per-thread vma caching: */
- struct vmacache vmacache;
#ifdef SPLIT_RSS_COUNTING
struct task_rss_stat rss_stat;
@@ -923,6 +923,10 @@ struct task_struct {
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
#endif
+#ifdef CONFIG_LRU_GEN
+ /* whether the LRU algorithm may apply to this access */
+ unsigned in_lru_fault:1;
+#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
@@ -953,6 +957,10 @@ struct task_struct {
#ifdef CONFIG_CPU_SUP_INTEL
unsigned reported_split_lock:1;
#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+ /* delay due to memory thrashing */
+ unsigned in_thrashing:1;
+#endif
unsigned long atomic_flags; /* Flags requiring atomic access. */
@@ -1364,6 +1372,10 @@ struct task_struct {
#endif
#endif
+#ifdef CONFIG_KMSAN
+ struct kmsan_ctx kmsan_ctx;
+#endif
+
#if IS_ENABLED(CONFIG_KUNIT)
struct kunit *kunit_test;
#endif
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 4d0a5be28b70..8270ad7ae14c 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -71,9 +71,8 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
-#define MMF_OOM_VICTIM 25 /* mm is the oom victim */
-#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */
-#define MMF_MULTIPROCESS 27 /* mm is shared between processes */
+#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */
+#define MMF_MULTIPROCESS 26 /* mm is shared between processes */
/*
* MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either
* replaced in the future by mm.pinned_vm when it becomes stable, or grow into
@@ -81,7 +80,7 @@ static inline int get_dumpable(struct mm_struct *mm)
* pinned pages were unpinned later on, we'll still keep this bit set for the
* lifecycle of this mm, just for simplicity.
*/
-#define MMF_HAS_PINNED 28 /* FOLL_PIN has run, never cleared */
+#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */
#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index e650946816d0..303ee7dd0c7e 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -27,6 +27,7 @@ enum sched_tunable_scaling {
#ifdef CONFIG_NUMA_BALANCING
extern int sysctl_numa_balancing_mode;
+extern unsigned int sysctl_numa_balancing_promote_rate_limit;
#else
#define sysctl_numa_balancing_mode 0
#endif
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index ff0b990de83d..d500ea967dc7 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -92,17 +92,19 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
int shmem_unuse(unsigned int type);
-extern bool shmem_is_huge(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index);
-static inline bool shmem_huge_enabled(struct vm_area_struct *vma)
+extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
+ pgoff_t index, bool shmem_huge_force);
+static inline bool shmem_huge_enabled(struct vm_area_struct *vma,
+ bool shmem_huge_force)
{
- return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff);
+ return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff,
+ shmem_huge_force);
}
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end);
-/* Flag allocation requirements to shmem_getpage */
+/* Flag allocation requirements to shmem_get_folio */
enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */
@@ -111,8 +113,8 @@ enum sgp_type {
SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
};
-extern int shmem_getpage(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp);
+int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
+ enum sgp_type sgp);
static inline struct page *shmem_read_mapping_page(
struct address_space *mapping, pgoff_t index)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 6a613e65e78d..90877fcde70b 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -108,7 +108,7 @@
# define SLAB_ACCOUNT 0
#endif
-#ifdef CONFIG_KASAN
+#ifdef CONFIG_KASAN_GENERIC
#define SLAB_KASAN ((slab_flags_t __force)0x08000000U)
#else
#define SLAB_KASAN 0
@@ -121,6 +121,12 @@
*/
#define SLAB_NO_USER_FLAGS ((slab_flags_t __force)0x10000000U)
+#ifdef CONFIG_KFENCE
+#define SLAB_SKIP_KFENCE ((slab_flags_t __force)0x20000000U)
+#else
+#define SLAB_SKIP_KFENCE 0
+#endif
+
/* The following flags affect the page allocator grouping pages by mobility */
/* Objects are reclaimable */
#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U)
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index bc2797955de9..9ca7798d7a31 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -14,9 +14,15 @@
#include <linux/gfp.h>
typedef u32 depot_stack_handle_t;
+/*
+ * Number of bits in the handle that stack depot doesn't use. Users may store
+ * information in them.
+ */
+#define STACK_DEPOT_EXTRA_BITS 5
depot_stack_handle_t __stack_depot_save(unsigned long *entries,
unsigned int nr_entries,
+ unsigned int extra_bits,
gfp_t gfp_flags, bool can_alloc);
/*
@@ -59,6 +65,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
unsigned int stack_depot_fetch(depot_stack_handle_t handle,
unsigned long **entries);
+unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle);
+
int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
int spaces);
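The extra bits travel inside the handle itself, so a user such as KMSAN can tag each saved stack without extra storage. A hedged sketch of the round trip, built only on the declarations above; the 0x1 tag is an invented example value:

static depot_stack_handle_t example_save_tagged_stack(void)
{
	unsigned long entries[16];
	unsigned int nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	unsigned int extra = 0x1;	/* caller-defined, at most STACK_DEPOT_EXTRA_BITS bits */
	depot_stack_handle_t handle;

	handle = __stack_depot_save(entries, nr, extra, GFP_KERNEL, true);
	if (handle)
		WARN_ON(stack_depot_get_extra_bits(handle) != extra);

	return handle;
}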
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 43150b9bbc5c..a18cf4b7c724 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -162,6 +162,10 @@ union swap_header {
*/
struct reclaim_state {
unsigned long reclaimed_slab;
+#ifdef CONFIG_LRU_GEN
+ /* per-thread mm walk data */
+ struct lru_gen_mm_walk *mm_walk;
+#endif
};
#ifdef __KERNEL__
@@ -351,6 +355,11 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio)
return entry;
}
+static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry)
+{
+ folio->private = (void *)entry.val;
+}
+
/* linux/mm/workingset.c */
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
@@ -375,11 +384,11 @@ extern unsigned long totalreserve_pages;
/* linux/mm/swap.c */
-extern void lru_note_cost(struct lruvec *lruvec, bool file,
- unsigned int nr_pages);
-extern void lru_note_cost_folio(struct folio *);
-extern void folio_add_lru(struct folio *);
-extern void lru_cache_add(struct page *);
+void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages);
+void lru_note_cost_folio(struct folio *);
+void folio_add_lru(struct folio *);
+void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
+void lru_cache_add(struct page *);
void mark_page_accessed(struct page *);
void folio_mark_accessed(struct folio *);
@@ -481,7 +490,8 @@ static inline long get_nr_swap_pages(void)
extern void si_swapinfo(struct sysinfo *);
swp_entry_t folio_alloc_swap(struct folio *folio);
-extern void put_swap_page(struct page *page, swp_entry_t entry);
+bool folio_free_swap(struct folio *folio);
+void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
@@ -500,7 +510,6 @@ extern int __swp_swapcount(swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
-extern int try_to_free_swap(struct page *);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
@@ -566,7 +575,7 @@ static inline void swap_free(swp_entry_t swp)
{
}
-static inline void put_swap_page(struct page *page, swp_entry_t swp)
+static inline void put_swap_folio(struct folio *folio, swp_entry_t swp)
{
}
@@ -585,11 +594,6 @@ static inline int swp_swapcount(swp_entry_t entry)
return 0;
}
-static inline int try_to_free_swap(struct page *page)
-{
- return 0;
-}
-
static inline swp_entry_t folio_alloc_swap(struct folio *folio)
{
swp_entry_t entry;
@@ -597,6 +601,11 @@ static inline swp_entry_t folio_alloc_swap(struct folio *folio)
return entry;
}
+static inline bool folio_free_swap(struct folio *folio)
+{
+ return false;
+}
+
static inline int add_swap_extent(struct swap_info_struct *sis,
unsigned long start_page,
unsigned long nr_pages, sector_t start_block)
@@ -657,7 +666,7 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
cgroup_throttle_swaprate(&folio->page, gfp);
}
-#ifdef CONFIG_MEMCG_SWAP
+#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry);
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
@@ -677,7 +686,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p
}
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
-extern bool mem_cgroup_swap_full(struct page *page);
+extern bool mem_cgroup_swap_full(struct folio *folio);
#else
static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
@@ -699,7 +708,7 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
return get_nr_swap_pages();
}
-static inline bool mem_cgroup_swap_full(struct page *page)
+static inline bool mem_cgroup_swap_full(struct folio *folio)
{
return vm_swap_full();
}
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
index a12dd1c3966c..ae73a87775b3 100644
--- a/include/linux/swap_cgroup.h
+++ b/include/linux/swap_cgroup.h
@@ -4,7 +4,7 @@
#include <linux/swap.h>
-#ifdef CONFIG_MEMCG_SWAP
+#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
unsigned short old, unsigned short new);
@@ -40,6 +40,6 @@ static inline void swap_cgroup_swapoff(int type)
return;
}
-#endif /* CONFIG_MEMCG_SWAP */
+#endif
#endif /* __LINUX_SWAP_CGROUP_H */
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 54078542134c..7ed529a77c5b 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -8,6 +8,11 @@
*/
extern struct swap_info_struct *swap_info[];
extern unsigned long generic_max_swapfile_size(void);
-extern unsigned long max_swapfile_size(void);
+unsigned long arch_max_swapfile_size(void);
+
+/* Maximum swapfile size supported for the arch (not inclusive). */
+extern unsigned long swapfile_maximum_size;
+/* Whether swap migration entry supports storing A/D bits for the arch */
+extern bool swap_migration_ad_supported;
#endif /* _LINUX_SWAPFILE_H */
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index a3d435bf9f97..86b95ccb81bb 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -8,6 +8,10 @@
#ifdef CONFIG_MMU
+#ifdef CONFIG_SWAP
+#include <linux/swapfile.h>
+#endif /* CONFIG_SWAP */
+
/*
* swapcache pages are stored in the swapper_space radix tree. We want to
* get good packing density in that tree, so the index should be dense in
@@ -23,6 +27,45 @@
#define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
#define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1)
+/*
+ * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To
+ * store a PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries
+ * can use the extra bits to store other information besides the PFN.
+ */
+#ifdef MAX_PHYSMEM_BITS
+#define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
+#else /* MAX_PHYSMEM_BITS */
+#define SWP_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT)
+#endif /* MAX_PHYSMEM_BITS */
+#define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1)
+
+/**
+ * Migration swap entry specific bitfield definitions. Layout:
+ *
+ * |----------+--------------------|
+ * | swp_type | swp_offset |
+ * |----------+--------+-+-+-------|
+ * | | resv |D|A| PFN |
+ * |----------+--------+-+-+-------|
+ *
+ * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A)
+ * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D)
+ *
+ * Note: A/D bits will be stored in migration entries iff there are enough
+ * free bits in the arch-specific swp offset. By default we'll ignore A/D bits
+ * when migrating a page. Please refer to migration_entry_supports_ad()
+ * for more information. If there are more bits besides the PFN and A/D bits,
+ * they should be reserved and always be zero.
+ */
+#define SWP_MIG_YOUNG_BIT (SWP_PFN_BITS)
+#define SWP_MIG_DIRTY_BIT (SWP_PFN_BITS + 1)
+#define SWP_MIG_TOTAL_BITS (SWP_PFN_BITS + 2)
+
+#define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT)
+#define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT)
+
+static inline bool is_pfn_swap_entry(swp_entry_t entry);
+
/* Clear all flags but only keep swp_entry_t related information */
static inline pte_t pte_swp_clear_flags(pte_t pte)
{
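To make the layout above concrete, a worked example with assumed x86-64-style constants (MAX_PHYSMEM_BITS = 46 for 4-level paging, PAGE_SHIFT = 12); the exact values are arch-dependent:

	SWP_PFN_BITS       = 46 - 12 = 34   /* PFN occupies offset bits 0..33 */
	SWP_PFN_MASK       = (1UL << 34) - 1
	SWP_MIG_YOUNG_BIT  = 34             /* bit A in the diagram above */
	SWP_MIG_DIRTY_BIT  = 35             /* bit D in the diagram above */
	SWP_MIG_TOTAL_BITS = 36             /* the arch swp offset must provide
					       at least 36 bits for A/D to be
					       carried across migration */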
@@ -64,6 +107,17 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
return entry.val & SWP_OFFSET_MASK;
}
+/*
+ * This should only be called upon a pfn swap entry to get the PFN stored
+ * in the swap entry. Please refer to is_pfn_swap_entry() for the
+ * definition of a pfn swap entry.
+ */
+static inline unsigned long swp_offset_pfn(swp_entry_t entry)
+{
+ VM_BUG_ON(!is_pfn_swap_entry(entry));
+ return swp_offset(entry) & SWP_PFN_MASK;
+}
+
/* check whether a pte points to a swap entry */
static inline int is_swap_pte(pte_t pte)
{
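A short usage sketch of the new accessor; the helper name below is illustrative, not part of this series:

/* Sketch: map a PFN swap entry held in a PTE back to its struct page. */
static struct page *pte_pfn_swap_page(pte_t pte)
{
	swp_entry_t entry = pte_to_swp_entry(pte);

	if (!is_pfn_swap_entry(entry))
		return NULL;

	/*
	 * swp_offset_pfn() strips the A/D and reserved bits; a raw
	 * swp_offset() would include them and could yield a bogus PFN.
	 */
	return pfn_to_page(swp_offset_pfn(entry));
}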
@@ -240,6 +294,52 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
return swp_entry(SWP_MIGRATION_WRITE, offset);
}
+/*
+ * Returns whether the host has a swap offset field large enough to carry
+ * page table A/D bits across page migrations. The result is arch-specific.
+ */
+static inline bool migration_entry_supports_ad(void)
+{
+#ifdef CONFIG_SWAP
+ return swap_migration_ad_supported;
+#else /* CONFIG_SWAP */
+ return false;
+#endif /* CONFIG_SWAP */
+}
+
+static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
+{
+ if (migration_entry_supports_ad())
+ return swp_entry(swp_type(entry),
+ swp_offset(entry) | SWP_MIG_YOUNG);
+ return entry;
+}
+
+static inline bool is_migration_entry_young(swp_entry_t entry)
+{
+ if (migration_entry_supports_ad())
+ return swp_offset(entry) & SWP_MIG_YOUNG;
+	/* Keep the old behavior of aging the page after migration */
+ return false;
+}
+
+static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
+{
+ if (migration_entry_supports_ad())
+ return swp_entry(swp_type(entry),
+ swp_offset(entry) | SWP_MIG_DIRTY);
+ return entry;
+}
+
+static inline bool is_migration_entry_dirty(swp_entry_t entry)
+{
+ if (migration_entry_supports_ad())
+ return swp_offset(entry) & SWP_MIG_DIRTY;
+	/* Keep the old behavior of a clean page after migration */
+ return false;
+}
+
extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl);
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
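A sketch of how a migration path is expected to use these helpers: the unmap side folds the PTE's accessed/dirty state into the migration entry, and the remap side restores it when the entry is replaced. Variable names (pteval, swp_pte, pte, page) are illustrative:

/* Unmap side: preserve A/D state in the migration entry (sketch). */
swp_entry_t entry = make_readable_migration_entry(page_to_pfn(page));

if (pte_young(pteval))
	entry = make_migration_entry_young(entry);
if (pte_dirty(pteval))
	entry = make_migration_entry_dirty(entry);
swp_pte = swp_entry_to_pte(entry);

/* Remap side: restore the bits when installing the new PTE (sketch). */
if (is_migration_entry_young(entry))
	pte = pte_mkyoung(pte);
if (is_migration_entry_dirty(entry))
	pte = pte_mkdirty(pte);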
@@ -247,8 +347,8 @@ extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
#ifdef CONFIG_HUGETLB_PAGE
extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
-#endif
-#else
+#endif /* CONFIG_HUGETLB_PAGE */
+#else /* CONFIG_MIGRATION */
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
return swp_entry(0, 0);
@@ -276,7 +376,7 @@ static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
#ifdef CONFIG_HUGETLB_PAGE
static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
-#endif
+#endif /* CONFIG_HUGETLB_PAGE */
static inline int is_writable_migration_entry(swp_entry_t entry)
{
return 0;
@@ -286,7 +386,26 @@ static inline int is_readable_migration_entry(swp_entry_t entry)
return 0;
}
-#endif
+static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
+{
+ return entry;
+}
+
+static inline bool is_migration_entry_young(swp_entry_t entry)
+{
+ return false;
+}
+
+static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
+{
+ return entry;
+}
+
+static inline bool is_migration_entry_dirty(swp_entry_t entry)
+{
+ return false;
+}
+#endif /* CONFIG_MIGRATION */
typedef unsigned long pte_marker;
@@ -369,7 +488,7 @@ static inline int pte_none_mostly(pte_t pte)
static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
{
- struct page *p = pfn_to_page(swp_offset(entry));
+ struct page *p = pfn_to_page(swp_offset_pfn(entry));
/*
* Any use of migration entries may only occur while the
@@ -387,6 +506,9 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
*/
static inline bool is_pfn_swap_entry(swp_entry_t entry)
{
+ /* Make sure the swp offset can always store the needed fields */
+ BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);
+
return is_migration_entry(entry) || is_device_private_entry(entry) ||
is_device_exclusive_entry(entry);
}
@@ -426,7 +548,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
{
return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
}
-#else
+#else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
struct page *page)
{
@@ -455,7 +577,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
{
return 0;
}
-#endif
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
#ifdef CONFIG_MEMORY_FAILURE
@@ -475,27 +597,17 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
return swp_type(entry) == SWP_HWPOISON;
}
-static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry)
-{
- return swp_offset(entry);
-}
-
static inline void num_poisoned_pages_inc(void)
{
atomic_long_inc(&num_poisoned_pages);
}
-static inline void num_poisoned_pages_dec(void)
-{
- atomic_long_dec(&num_poisoned_pages);
-}
-
static inline void num_poisoned_pages_sub(long i)
{
atomic_long_sub(i, &num_poisoned_pages);
}
-#else
+#else /* CONFIG_MEMORY_FAILURE */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
@@ -514,7 +626,7 @@ static inline void num_poisoned_pages_inc(void)
static inline void num_poisoned_pages_sub(long i)
{
}
-#endif
+#endif /* CONFIG_MEMORY_FAILURE */
static inline int non_swap_entry(swp_entry_t entry)
{
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 47e5d374c7eb..afb18f198843 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -58,20 +58,28 @@
static __always_inline __must_check unsigned long
__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
{
- instrument_copy_from_user(to, from, n);
+ unsigned long res;
+
+ instrument_copy_from_user_before(to, from, n);
check_object_size(to, n, false);
- return raw_copy_from_user(to, from, n);
+ res = raw_copy_from_user(to, from, n);
+ instrument_copy_from_user_after(to, from, n, res);
+ return res;
}
static __always_inline __must_check unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
+ unsigned long res;
+
might_fault();
+ instrument_copy_from_user_before(to, from, n);
if (should_fail_usercopy())
return n;
- instrument_copy_from_user(to, from, n);
check_object_size(to, n, false);
- return raw_copy_from_user(to, from, n);
+ res = raw_copy_from_user(to, from, n);
+ instrument_copy_from_user_after(to, from, n, res);
+ return res;
}
/**
@@ -115,8 +123,9 @@ _copy_from_user(void *to, const void __user *from, unsigned long n)
unsigned long res = n;
might_fault();
if (!should_fail_usercopy() && likely(access_ok(from, n))) {
- instrument_copy_from_user(to, from, n);
+ instrument_copy_from_user_before(to, from, n);
res = raw_copy_from_user(to, from, n);
+ instrument_copy_from_user_after(to, from, n, res);
}
if (unlikely(res))
memset(to + (n - res), 0, res);
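The single instrument_copy_from_user() hook is split into a before/after pair because KMSAN can only mark the destination initialized once it knows how many bytes were actually copied, which raw_copy_from_user() reports through its return value (bytes left uncopied). A rough sketch of the paired hooks on the instrumented.h side; the exact KMSAN call is an assumption based on the companion KMSAN series:

/* Sketch of the paired hooks (cf. include/linux/instrumented.h). */
static __always_inline void
instrument_copy_from_user_before(const void *to, const void __user *from,
				 unsigned long n)
{
	/* Checks that can run before the copy takes place. */
	kasan_check_write(to, n);
	kcsan_check_write(to, n);
}

static __always_inline void
instrument_copy_from_user_after(const void *to, const void __user *from,
				 unsigned long n, unsigned long left)
{
	/* Only n - left bytes were copied; unpoison exactly those. */
	kmsan_unpoison_memory(to, n - left);
}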
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e1b8a915e9e9..f07e6998bb68 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -175,9 +175,8 @@ extern bool userfaultfd_remove(struct vm_area_struct *vma,
unsigned long start,
unsigned long end);
-extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct list_head *uf);
+extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf);
extern void userfaultfd_unmap_complete(struct mm_struct *mm,
struct list_head *uf);
@@ -258,7 +257,7 @@ static inline bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
}
-static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
+static inline int userfaultfd_unmap_prep(struct mm_struct *mm,
unsigned long start, unsigned long end,
struct list_head *uf)
{
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index f3fc36cd2276..3518dba1e02f 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -129,10 +129,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
NR_TLB_LOCAL_FLUSH_ALL,
NR_TLB_LOCAL_FLUSH_ONE,
#endif /* CONFIG_DEBUG_TLBFLUSH */
-#ifdef CONFIG_DEBUG_VM_VMACACHE
- VMACACHE_FIND_CALLS,
- VMACACHE_FIND_HITS,
-#endif
#ifdef CONFIG_SWAP
SWAP_RA,
SWAP_RA_HIT,
diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
deleted file mode 100644
index 6fce268a4588..000000000000
--- a/include/linux/vmacache.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __LINUX_VMACACHE_H
-#define __LINUX_VMACACHE_H
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-
-static inline void vmacache_flush(struct task_struct *tsk)
-{
- memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas));
-}
-
-extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
-extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
- unsigned long addr);
-
-#ifndef CONFIG_MMU
-extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
- unsigned long start,
- unsigned long end);
-#endif
-
-static inline void vmacache_invalidate(struct mm_struct *mm)
-{
- mm->vmacache_seqnum++;
-}
-
-#endif /* __LINUX_VMACACHE_H */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index bfe38869498d..19cf5b6892ce 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -125,12 +125,6 @@ static inline void vm_events_fold_cpu(int cpu)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif
-#ifdef CONFIG_DEBUG_VM_VMACACHE
-#define count_vm_vmacache_event(x) count_vm_event(x)
-#else
-#define count_vm_vmacache_event(x) do {} while (0)
-#endif
-
#define __count_zid_vm_events(item, zid, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3f045f6d6c4f..06f9291b6fd5 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -17,20 +17,12 @@ struct bio;
DECLARE_PER_CPU(int, dirty_throttle_leaks);
/*
- * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
- *
- * (thresh - thresh/DIRTY_FULL_SCOPE, thresh)
- *
- * Further beyond, all dirtier tasks will enter a loop waiting (possibly long
- * time) for the dirty pages to drop, unless written enough pages.
- *
* The global dirty threshold is normally equal to the global dirty limit,
* except when the system suddenly allocates a lot of anonymous memory and
* knocks down the global dirty threshold quickly, in which case the global
* dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
*/
#define DIRTY_SCOPE 8
-#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2)
struct backing_dev_info;