From 24e41bf8a6b424c76c5902fb999e9eca61bdf83d Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 28 Aug 2023 17:08:57 +0200 Subject: mm: add a NO_INHERIT flag to the PR_SET_MDWE prctl This extends the current PR_SET_MDWE prctl arg with a bit to indicate that the process doesn't want MDWE protection to propagate to children. To implement this no-inherit mode, the tag in current->mm->flags must be absent from MMF_INIT_MASK. This means that the encoding for "MDWE but without inherit" is different in the prctl than in the mm flags. This leads to a bit of bit-mangling in the prctl implementation. Link: https://lkml.kernel.org/r/20230828150858.393570-6-revest@chromium.org Signed-off-by: Florent Revest Reviewed-by: Kees Cook Reviewed-by: Catalin Marinas Cc: Alexey Izbyshev Cc: Anshuman Khandual Cc: Ayush Jain Cc: David Hildenbrand Cc: Greg Thelen Cc: Joey Gouly Cc: KP Singh Cc: Mark Brown Cc: Michal Hocko Cc: Peter Xu Cc: Ryan Roberts Cc: Szabolcs Nagy Cc: Topi Miettinen Signed-off-by: Andrew Morton --- include/linux/sched/coredump.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux/sched') diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 0ee96ea7a0e9..1b37fa8fc723 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -91,4 +91,14 @@ static inline int get_dumpable(struct mm_struct *mm) MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) #define MMF_VM_MERGE_ANY 29 +#define MMF_HAS_MDWE_NO_INHERIT 30 + +static inline unsigned long mmf_init_flags(unsigned long flags) +{ + if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) + flags &= ~((1UL << MMF_HAS_MDWE) | + (1UL << MMF_HAS_MDWE_NO_INHERIT)); + return flags & MMF_INIT_MASK; +} + #endif /* _LINUX_SCHED_COREDUMP_H */ -- cgit v1.2.3 From 8c9ae56dc73b5ae48a14000b96292bd4f2aeb710 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:17 +0800 Subject: sched/numa, mm: make numa migrate functions to take a folio The cpupid (or 
access time) is stored in the head page for THP, so it is safe to make should_numa_migrate_memory() and numa_hint_fault_latency() take a folio. This is in preparation for large folio numa balancing. Link: https://lkml.kernel.org/r/20230921074417.24004-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/sched/numa_balancing.h | 6 +++--- kernel/sched/fair.c | 12 ++++++------ mm/mempolicy.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index 3988762efe15..06a9d35650f0 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -20,8 +20,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); extern void task_numa_free(struct task_struct *p, bool final); -extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, - int src_nid, int dst_cpu); +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, + int src_nid, int dst_cpu); #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) @@ -38,7 +38,7 @@ static inline void task_numa_free(struct task_struct *p, bool final) { } static inline bool should_numa_migrate_memory(struct task_struct *p, - struct page *page, int src_nid, int dst_cpu) + struct folio *folio, int src_nid, int dst_cpu) { return true; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cb225921bbca..42aefe7e6fdc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1722,12 +1722,12 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat) * The smaller the hint page fault latency, the higher 
the possibility * for the page to be hot. */ -static int numa_hint_fault_latency(struct page *page) +static int numa_hint_fault_latency(struct folio *folio) { int last_time, time; time = jiffies_to_msecs(jiffies); - last_time = xchg_page_access_time(page, time); + last_time = xchg_page_access_time(&folio->page, time); return (time - last_time) & PAGE_ACCESS_TIME_MASK; } @@ -1784,7 +1784,7 @@ static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, } } -bool should_numa_migrate_memory(struct task_struct *p, struct page * page, +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int src_nid, int dst_cpu) { struct numa_group *ng = deref_curr_numa_group(p); @@ -1814,16 +1814,16 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); th = pgdat->nbp_threshold ? : def_th; - latency = numa_hint_fault_latency(page); + latency = numa_hint_fault_latency(folio); if (latency >= th) return false; return !numa_promotion_rate_limit(pgdat, rate_limit, - thp_nr_pages(page)); + folio_nr_pages(folio)); } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + last_cpupid = page_cpupid_xchg_last(&folio->page, this_cpupid); if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 69c0eac7292c..abd94f4c7f6b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2642,7 +2642,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, if (pol->flags & MPOL_F_MORON) { polnid = thisnid; - if (!should_numa_migrate_memory(current, &folio->page, curnid, + if (!should_numa_migrate_memory(current, folio, curnid, thiscpu)) goto out; } -- cgit v1.2.3 From 3c6f33b7273a7e2f2b2497b62c8400bd957b2fbe Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 22 Sep 2023 14:11:40 -0700 Subject: 
mm/ksm: support fork/exec for prctl Patch series "mm/ksm: add fork-exec support for prctl", v4. A process can enable KSM with the prctl system call. When the process is forked, the KSM flag is inherited by the child process. However, if the process is executing an exec system call directly after the fork, the KSM setting is cleared. This patch series addresses this problem. 1) Change the mask in coredump.h for execing a new process 2) Add a new test case in ksm_functional_tests This patch (of 2): Today we have two ways to enable KSM: 1) madvise system call This allows enabling KSM for a memory region for a long time. 2) prctl system call This is a recent addition to enable KSM for the complete process. In addition, when a process is forked, the KSM setting is inherited. This change only affects the second case. One of the use cases for (2) was to support the ability to enable KSM for cgroups. This allows systemd to enable KSM for the seed process. By enabling it in the seed process, all child processes inherit the setting. This works correctly when the process is forked. However, it doesn't support the fork/exec workflow. From the previous cover letter: .... Use case 3: With the madvise call, sharing opportunities are only enabled for the current process: it is a workload-local decision. A considerable number of sharing opportunities may exist across multiple workloads or jobs (if they are part of the same security domain). Only a higher-level entity like a job scheduler or container can know for certain if it's running one or more instances of a job. That job scheduler, however, doesn't have the necessary internal workload knowledge to make targeted madvise calls. .... In addition, it can also be a bit surprising that fork keeps the KSM setting and fork/exec does not. 
Link: https://lkml.kernel.org/r/20230922211141.320789-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20230922211141.320789-2-shr@devkernel.io Signed-off-by: Stefan Roesch Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process") Reviewed-by: David Hildenbrand Reported-by: Carl Klemm Tested-by: Carl Klemm Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/sched/coredump.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 1b37fa8fc723..02f5090ffea2 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* @@ -85,13 +86,15 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_MDWE 28 #define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) -#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) +#define MMF_HAS_MDWE_NO_INHERIT 29 -#define MMF_VM_MERGE_ANY 29 -#define MMF_HAS_MDWE_NO_INHERIT 30 +#define MMF_VM_MERGE_ANY 30 +#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) + +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ + MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ + MMF_VM_MERGE_ANY_MASK) static inline unsigned long mmf_init_flags(unsigned long flags) { -- cgit v1.2.3 From e86828e5446d95676835679837d995dec188d2be Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 
2023 15:53:44 -0700 Subject: mm: kmem: scoped objcg protection Switch to a scope-based protection of the objcg pointer on slab/kmem allocation paths. Instead of using the get_() semantics in the pre-allocation hook and putting the reference afterwards, let's rely on the fact that objcg is pinned by the scope. It's possible because: 1) if the objcg is received from the current task struct, the task is keeping a reference to the objcg. 2) if the objcg is received from an active memcg (remote charging), the memcg is pinned by the scope and has a reference to the corresponding objcg. Link: https://lkml.kernel.org/r/20231019225346.1822282-5-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Tested-by: Naresh Kamboju Acked-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 9 +++++++++ include/linux/sched/mm.h | 4 ++++ mm/memcontrol.c | 47 ++++++++++++++++++++++++++++++++++++++++++++-- mm/slab.h | 15 ++++++++------- 4 files changed, 66 insertions(+), 9 deletions(-) (limited to 'include/linux/sched') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cc110cc8fdfc..8006bc3bd7bf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1796,6 +1796,15 @@ bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); +/* + * The returned objcg pointer is safe to use without additional + * protection within a scope. The scope is defined either by + * the current task (similar to the "current" global variable) + * or by set_active_memcg() pair. + * Please, use obj_cgroup_get() to get a reference if the pointer + * needs to be used outside of the local scope. 
+ */ +struct obj_cgroup *current_obj_cgroup(void); struct obj_cgroup *get_obj_cgroup_from_current(void); struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8d89c8c4fac1..9a19f1b42f64 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -403,6 +403,10 @@ DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); * __GFP_ACCOUNT allocations till the end of the scope will be charged to the * given memcg. * + * Please, make sure that caller has a reference to the passed memcg structure, + * so its lifetime is guaranteed to exceed the scope between two + * set_active_memcg() calls. + * * NOTE: This function can nest. Users must save the return value and * reset the previous value after their own charging scope is over. */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ff036d5d339d..a6457c8b5e16 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3170,6 +3170,49 @@ from_memcg: return objcg; } +__always_inline struct obj_cgroup *current_obj_cgroup(void) +{ + struct mem_cgroup *memcg; + struct obj_cgroup *objcg; + + if (in_task()) { + memcg = current->active_memcg; + if (unlikely(memcg)) + goto from_memcg; + + objcg = READ_ONCE(current->objcg); + if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG)) + objcg = current_objcg_update(); + /* + * Objcg reference is kept by the task, so it's safe + * to use the objcg by the current task. + */ + return objcg; + } + + memcg = this_cpu_read(int_active_memcg); + if (unlikely(memcg)) + goto from_memcg; + + return NULL; + +from_memcg: + for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { + /* + * Memcg pointer is protected by scope (see set_active_memcg()) + * and is pinning the corresponding objcg, so objcg can't go + * away and can be used within the scope without any additional + * protection. 
+ */ + objcg = rcu_dereference_check(memcg->objcg, 1); + if (likely(objcg)) + break; + objcg = NULL; + } + + return objcg; +} + struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { struct obj_cgroup *objcg; @@ -3264,15 +3307,15 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) struct obj_cgroup *objcg; int ret = 0; - objcg = get_obj_cgroup_from_current(); + objcg = current_obj_cgroup(); if (objcg) { ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); if (!ret) { + obj_cgroup_get(objcg); page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM; return 0; } - obj_cgroup_put(objcg); } return ret; } diff --git a/mm/slab.h b/mm/slab.h index 799a315695c6..3d07fb428393 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -484,7 +484,12 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) return true; - objcg = get_obj_cgroup_from_current(); + /* + * The obtained objcg pointer is safe to use within the current scope, + * defined by current task or set_active_memcg() pair. + * obj_cgroup_get() is used to get a permanent reference. + */ + objcg = current_obj_cgroup(); if (!objcg) return true; @@ -497,17 +502,14 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, css_put(&memcg->css); if (ret) - goto out; + return false; } if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) - goto out; + return false; *objcgp = objcg; return true; -out: - obj_cgroup_put(objcg); - return false; } static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, @@ -542,7 +544,6 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, obj_cgroup_uncharge(objcg, obj_full_size(s)); } } - obj_cgroup_put(objcg); } static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, -- cgit v1.2.3