From bc0c3357601e3ff1b006600530079bd246ef0d82 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 23 Aug 2023 19:05:56 +0200 Subject: mm: remove remnants of SPLIT_RSS_COUNTING The feature got retired in f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter"), but the patch failed to fully clean it up. Link: https://lkml.kernel.org/r/20230823170556.2281747-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Acked-by: Shakeel Butt Signed-off-by: Andrew Morton --- kernel/exit.c | 4 ---- kernel/fork.c | 4 ---- kernel/kthread.c | 1 - 3 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index edb50b4c9972..3cdbe797008f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -539,7 +539,6 @@ static void exit_mm(void) exit_mm_release(current, mm); if (!mm) return; - sync_mm_rss(mm); mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); @@ -829,9 +828,6 @@ void __noreturn do_exit(long code) io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ - /* sync mm's RSS info before statistics gathering */ - if (tsk->mm) - sync_mm_rss(tsk->mm); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { diff --git a/kernel/fork.c b/kernel/fork.c index 3b6d20dfb9a8..1779183a7cb3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2406,10 +2406,6 @@ __latent_entropy struct task_struct *copy_process( p->io_uring = NULL; #endif -#if defined(SPLIT_RSS_COUNTING) - memset(&p->rss_stat, 0, sizeof(p->rss_stat)); -#endif - p->default_timer_slack_ns = current->timer_slack_ns; #ifdef CONFIG_PSI diff --git a/kernel/kthread.c b/kernel/kthread.c index 1eea53050bab..c46128ec0c0a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1469,7 +1469,6 @@ void kthread_unuse_mm(struct mm_struct *mm) * clearing tsk->mm. */ smp_mb__after_spinlock(); - sync_mm_rss(mm); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); -- cgit v1.2.3 From 2fbacff0cbf58530fa459c3f28ba9125df33ad86 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:14 +0800 Subject: rcu: dynamically allocate the rcu-lazy shrinker Use new APIs to dynamically allocate the rcu-lazy shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-16-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Joel Fernandes (Google) Acked-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- kernel/rcu/tree_nocb.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 5598212d1f27..4efbf7333d4e 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1396,13 +1396,6 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return count ? count : SHRINK_STOP; } - -static struct shrinker lazy_rcu_shrinker = { - .count_objects = lazy_rcu_shrink_count, - .scan_objects = lazy_rcu_shrink_scan, - .batch = 0, - .seeks = DEFAULT_SEEKS, -}; #endif // #ifdef CONFIG_RCU_LAZY void __init rcu_init_nohz(void) @@ -1410,6 +1403,7 @@ void __init rcu_init_nohz(void) int cpu; struct rcu_data *rdp; const struct cpumask *cpumask = NULL; + struct shrinker * __maybe_unused lazy_rcu_shrinker; #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) @@ -1436,8 +1430,15 @@ void __init rcu_init_nohz(void) return; #ifdef CONFIG_RCU_LAZY - if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy")) - pr_err("Failed to register lazy_rcu shrinker!\n"); + lazy_rcu_shrinker = shrinker_alloc(0, "rcu-lazy"); + if (!lazy_rcu_shrinker) { + pr_err("Failed to allocate lazy_rcu shrinker!\n"); + } else { + lazy_rcu_shrinker->count_objects = lazy_rcu_shrink_count; + lazy_rcu_shrinker->scan_objects = lazy_rcu_shrink_scan; + + shrinker_register(lazy_rcu_shrinker); + } #endif // #ifdef CONFIG_RCU_LAZY if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { -- cgit v1.2.3 From 21e0b932fb5d72bb6cefdf56e22bfddea968cf32 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:15 +0800 Subject: rcu: dynamically allocate the rcu-kfree shrinker Use new APIs to dynamically allocate the rcu-kfree shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-17-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Joel Fernandes (Google) Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- kernel/rcu/tree.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb1caefa8bd0..06e2ed495c02 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3449,13 +3449,6 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return freed == 0 ? SHRINK_STOP : freed; } -static struct shrinker kfree_rcu_shrinker = { - .count_objects = kfree_rcu_shrink_count, - .scan_objects = kfree_rcu_shrink_scan, - .batch = 0, - .seeks = DEFAULT_SEEKS, -}; - void __init kfree_rcu_scheduler_running(void) { int cpu; @@ -4931,6 +4924,7 @@ static void __init kfree_rcu_batch_init(void) { int cpu; int i, j; + struct shrinker *kfree_rcu_shrinker; /* Clamp it to [0:100] seconds interval. */ if (rcu_delay_page_cache_fill_msec < 0 || @@ -4962,8 +4956,17 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } - if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree")) - pr_err("Failed to register kfree_rcu() shrinker!\n"); + + kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree"); + if (!kfree_rcu_shrinker) { + pr_err("Failed to allocate kfree_rcu() shrinker!\n"); + return; + } + + kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count; + kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan; + + shrinker_register(kfree_rcu_shrinker); } void __init rcu_init(void) -- cgit v1.2.3 From 24e41bf8a6b424c76c5902fb999e9eca61bdf83d Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 28 Aug 2023 17:08:57 +0200 Subject: mm: add a NO_INHERIT flag to the PR_SET_MDWE prctl This extends the current PR_SET_MDWE prctl arg with a bit to indicate that the process doesn't want MDWE protection to propagate to children. To implement this no-inherit mode, the tag in current->mm->flags must be absent from MMF_INIT_MASK. This means that the encoding for "MDWE but without inherit" is different in the prctl than in the mm flags. This leads to a bit of bit-mangling in the prctl implementation. Link: https://lkml.kernel.org/r/20230828150858.393570-6-revest@chromium.org Signed-off-by: Florent Revest Reviewed-by: Kees Cook Reviewed-by: Catalin Marinas Cc: Alexey Izbyshev Cc: Anshuman Khandual Cc: Ayush Jain Cc: David Hildenbrand Cc: Greg Thelen Cc: Joey Gouly Cc: KP Singh Cc: Mark Brown Cc: Michal Hocko Cc: Peter Xu Cc: Ryan Roberts Cc: Szabolcs Nagy Cc: Topi Miettinen Signed-off-by: Andrew Morton --- include/linux/sched/coredump.h | 10 ++++++++++ include/uapi/linux/prctl.h | 1 + kernel/fork.c | 2 +- kernel/sys.c | 32 ++++++++++++++++++++++++++------ tools/include/uapi/linux/prctl.h | 1 + 5 files changed, 39 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 0ee96ea7a0e9..1b37fa8fc723 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -91,4 +91,14 @@ static inline int get_dumpable(struct mm_struct *mm) MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) #define MMF_VM_MERGE_ANY 29 +#define MMF_HAS_MDWE_NO_INHERIT 30 + +static inline unsigned long mmf_init_flags(unsigned long flags) +{ + if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) + flags &= ~((1UL << MMF_HAS_MDWE) | + (1UL << MMF_HAS_MDWE_NO_INHERIT)); + return flags & MMF_INIT_MASK; +} + #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 9a85c69782bd..370ed14b1ae0 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -284,6 +284,7 @@ struct prctl_mm_map { /* Memory deny write / execute */ #define PR_SET_MDWE 65 # define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) +# define PR_MDWE_NO_INHERIT (1UL << 1) #define PR_GET_MDWE 66 diff --git a/kernel/fork.c b/kernel/fork.c index 1779183a7cb3..e45a4457ba83 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1288,7 +1288,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, hugetlb_count_init(mm); if (current->mm) { - mm->flags = current->mm->flags & MMF_INIT_MASK; + mm->flags = mmf_init_flags(current->mm->flags); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { mm->flags = default_dump_filter; diff --git a/kernel/sys.c b/kernel/sys.c index 2410e3999ebe..4a8073c1b255 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2368,19 +2368,41 @@ static int prctl_set_vma(unsigned long opt, unsigned long start, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline unsigned long get_current_mdwe(void) +{ + unsigned long ret = 0; + + if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + ret |= PR_MDWE_REFUSE_EXEC_GAIN; + if (test_bit(MMF_HAS_MDWE_NO_INHERIT, ¤t->mm->flags)) + ret |= PR_MDWE_NO_INHERIT; + + return ret; +} + static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, unsigned long arg4, unsigned long arg5) { + unsigned long current_bits; + if (arg3 || arg4 || arg5) return -EINVAL; - if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN)) + if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT)) + return -EINVAL; + + /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */ + if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) return -EINVAL; + current_bits = get_current_mdwe(); + if (current_bits && current_bits != bits) + return -EPERM; /* Cannot unset the flags */ + + if (bits & PR_MDWE_NO_INHERIT) + set_bit(MMF_HAS_MDWE_NO_INHERIT, ¤t->mm->flags); if (bits & PR_MDWE_REFUSE_EXEC_GAIN) set_bit(MMF_HAS_MDWE, ¤t->mm->flags); - else if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) - return -EPERM; /* Cannot unset the flag */ return 0; } @@ -2390,9 +2412,7 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, { if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - - return test_bit(MMF_HAS_MDWE, ¤t->mm->flags) ? - PR_MDWE_REFUSE_EXEC_GAIN : 0; + return get_current_mdwe(); } static int prctl_get_auxv(void __user *addr, unsigned long len) diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 9a85c69782bd..370ed14b1ae0 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -284,6 +284,7 @@ struct prctl_mm_map { /* Memory deny write / execute */ #define PR_SET_MDWE 65 # define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) +# define PR_MDWE_NO_INHERIT (1UL << 1) #define PR_GET_MDWE 66 -- cgit v1.2.3 From 8c9ae56dc73b5ae48a14000b96292bd4f2aeb710 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:17 +0800 Subject: sched/numa, mm: make numa migrate functions to take a folio The cpupid (or access time) is stored in the head page for THP, so it is safely to make should_numa_migrate_memory() and numa_hint_fault_latency() to take a folio. This is in preparation for large folio numa balancing. Link: https://lkml.kernel.org/r/20230921074417.24004-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/sched/numa_balancing.h | 6 +++--- kernel/sched/fair.c | 12 ++++++------ mm/mempolicy.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index 3988762efe15..06a9d35650f0 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -20,8 +20,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); extern void task_numa_free(struct task_struct *p, bool final); -extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, - int src_nid, int dst_cpu); +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, + int src_nid, int dst_cpu); #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) @@ -38,7 +38,7 @@ static inline void task_numa_free(struct task_struct *p, bool final) { } static inline bool should_numa_migrate_memory(struct task_struct *p, - struct page *page, int src_nid, int dst_cpu) + struct folio *folio, int src_nid, int dst_cpu) { return true; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cb225921bbca..42aefe7e6fdc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1722,12 +1722,12 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat) * The smaller the hint page fault latency, the higher the possibility * for the page to be hot. */ -static int numa_hint_fault_latency(struct page *page) +static int numa_hint_fault_latency(struct folio *folio) { int last_time, time; time = jiffies_to_msecs(jiffies); - last_time = xchg_page_access_time(page, time); + last_time = xchg_page_access_time(&folio->page, time); return (time - last_time) & PAGE_ACCESS_TIME_MASK; } @@ -1784,7 +1784,7 @@ static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, } } -bool should_numa_migrate_memory(struct task_struct *p, struct page * page, +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int src_nid, int dst_cpu) { struct numa_group *ng = deref_curr_numa_group(p); @@ -1814,16 +1814,16 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); th = pgdat->nbp_threshold ? : def_th; - latency = numa_hint_fault_latency(page); + latency = numa_hint_fault_latency(folio); if (latency >= th) return false; return !numa_promotion_rate_limit(pgdat, rate_limit, - thp_nr_pages(page)); + folio_nr_pages(folio)); } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + last_cpupid = page_cpupid_xchg_last(&folio->page, this_cpupid); if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 69c0eac7292c..abd94f4c7f6b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2642,7 +2642,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, if (pol->flags & MPOL_F_MORON) { polnid = thisnid; - if (!should_numa_migrate_memory(current, &folio->page, curnid, + if (!should_numa_migrate_memory(current, folio, curnid, thiscpu)) goto out; } -- cgit v1.2.3 From c43cfa42541c04a3a94312e39ab81c41ba431277 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Oct 2023 00:14:51 +0100 Subject: mm: make __access_remote_vm() static Patch series "various improvements to the GUP interface", v2. A series of fixes to simplify and improve the GUP interface with an eye to providing groundwork to future improvements:- * __access_remote_vm() and access_remote_vm() are functionally identical, so make the former static such that in future we can potentially change the external-facing implementation details of this function. * Extend is_valid_gup_args() to cover the missing FOLL_TOUCH case, and simplify things by defining INTERNAL_GUP_FLAGS to check against. * Adjust __get_user_pages_locked() to explicitly treat a failure to pin any pages as an error in all circumstances other than FOLL_NOWAIT being specified, bringing it in line with the nommu implementation of this function. * (With many thanks to Arnd who suggested this in the first instance) Update get_user_page_vma_remote() to explicitly only return a page or an error, simplifying the interface and avoiding the questionable IS_ERR_OR_NULL() pattern. This patch (of 4): access_remote_vm() passes through parameters to __access_remote_vm() directly, so remove the __access_remote_vm() function from mm.h and use access_remote_vm() in the one caller that needs it (ptrace_access_vm()). This allows future adjustments to the GUP-internal __access_remote_vm() function while keeping the access_remote_vm() function stable. Link: https://lkml.kernel.org/r/cover.1696288092.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/f7877c5039ce1c202a514a8aeeefc5cdd5e32d19.1696288092.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Catalin Marinas Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Hubbard Cc: Mark Rutland Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Cochran Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- kernel/ptrace.c | 2 +- mm/memory.c | 4 ++-- mm/nommu.c | 4 ++-- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 52c40b3d0813..7b89f7bd420d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2415,8 +2415,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr, - void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 443057bee87c..d8b5e13a2229 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -59,7 +59,7 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, return 0; } - ret = __access_remote_vm(mm, addr, buf, len, gup_flags); + ret = access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; diff --git a/mm/memory.c b/mm/memory.c index 22784d3b886d..e47c36c0aef0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5791,8 +5791,8 @@ EXPORT_SYMBOL_GPL(generic_access_phys); /* * Access another process' address space as given in mm. */ -int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, - int len, unsigned int gup_flags) +static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int write = gup_flags & FOLL_WRITE; diff --git a/mm/nommu.c b/mm/nommu.c index 7f9e9e5a0e12..f9553579389b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1651,8 +1651,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, } EXPORT_SYMBOL(filemap_map_pages); -int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, - int len, unsigned int gup_flags) +static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; int write = gup_flags & FOLL_WRITE; -- cgit v1.2.3 From 6a1960b8a8773324d870fa32ba68ff3106523a95 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Oct 2023 00:14:54 +0100 Subject: mm/gup: adapt get_user_page_vma_remote() to never return NULL get_user_pages_remote() will never return 0 except in the case of FOLL_NOWAIT being specified, which we explicitly disallow. This simplifies error handling for the caller and avoids the awkwardness of dealing with both errors and failing to pin. Failing to pin here is an error. Link: https://lkml.kernel.org/r/00319ce292d27b3aae76a0eb220ce3f528187508.1696288092.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Suggested-by: Arnd Bergmann Reviewed-by: Arnd Bergmann Acked-by: Catalin Marinas Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Hubbard Cc: Mark Rutland Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Cochran Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/kernel/mte.c | 4 ++-- include/linux/mm.h | 12 +++++++++--- kernel/events/uprobes.c | 4 ++-- mm/memory.c | 3 +-- 4 files changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 4edecaac8f91..8878b392df58 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -411,8 +411,8 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); - if (IS_ERR_OR_NULL(page)) { - err = page == NULL ? -EIO : PTR_ERR(page); + if (IS_ERR(page)) { + err = PTR_ERR(page); break; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b89f7bd420d..fa608cba041f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2425,6 +2425,9 @@ long pin_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, int *locked); +/* + * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT. + */ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, unsigned long addr, int gup_flags, @@ -2432,12 +2435,15 @@ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, { struct page *page; struct vm_area_struct *vma; - int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); + int got; + + if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT))) + return ERR_PTR(-EINVAL); + + got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); if (got < 0) return ERR_PTR(got); - if (got == 0) - return NULL; vma = vma_lookup(mm, addr); if (WARN_ON_ONCE(!vma)) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3048589e2e85..435aac1d8c27 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -474,8 +474,8 @@ retry: gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); - if (IS_ERR_OR_NULL(old_page)) - return old_page ? PTR_ERR(old_page) : 0; + if (IS_ERR(old_page)) + return PTR_ERR(old_page); ret = verify_opcode(old_page, vaddr, &opcode); if (ret <= 0) diff --git a/mm/memory.c b/mm/memory.c index e47c36c0aef0..1f88e4f6fbf2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5815,7 +5815,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); - if (IS_ERR_OR_NULL(page)) { + if (IS_ERR(page)) { /* We might need to expand the stack to access it */ vma = vma_lookup(mm, addr); if (!vma) { @@ -5829,7 +5829,6 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, continue; } - /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. -- cgit v1.2.3 From 247dbcdbf790c52fc76cf8e327cd0a5778e41e66 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:07 +0100 Subject: bitops: add xor_unlock_is_negative_byte() Replace clear_bit_and_unlock_is_negative_byte() with xor_unlock_is_negative_byte(). We have a few places that like to lock a folio, set a flag and unlock it again. Allow for the possibility of combining the latter two operations for efficiency. We are guaranteed that the caller holds the lock, so it is safe to unlock it with the xor. The caller must guarantee that nobody else will set the flag without holding the lock; it is not safe to do this with the PG_dirty flag, for example. Link: https://lkml.kernel.org/r/20231004165317.1061855-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/bitops.h | 17 +++++----------- arch/x86/include/asm/bitops.h | 11 +++++------ include/asm-generic/bitops/instrumented-lock.h | 27 ++++++++++++++------------ include/asm-generic/bitops/lock.h | 21 +++++--------------- kernel/kcsan/kcsan_test.c | 8 ++++---- kernel/kcsan/selftest.c | 8 ++++---- mm/filemap.c | 5 +++++ mm/kasan/kasan_test.c | 7 ++++--- 8 files changed, 47 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 7e0f0322912b..40cc3ded60cb 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -234,32 +234,25 @@ static inline int arch_test_and_change_bit(unsigned long nr, } #ifdef CONFIG_PPC64 -static inline unsigned long -clear_bit_unlock_return_word(int nr, volatile unsigned long *addr) +static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *p) { unsigned long old, t; - unsigned long *p = (unsigned long *)addr + BIT_WORD(nr); - unsigned long mask = BIT_MASK(nr); __asm__ __volatile__ ( PPC_RELEASE_BARRIER "1:" PPC_LLARX "%0,0,%3,0\n" - "andc %1,%0,%2\n" + "xor %1,%0,%2\n" PPC_STLCX "%1,0,%3\n" "bne- 1b\n" : "=&r" (old), "=&r" (t) : "r" (mask), "r" (p) : "cc", "memory"); - return old; + return (old & BIT_MASK(7)) != 0; } -/* - * This is a special function for mm/filemap.c - * Bit 7 corresponds to PG_waiters. - */ -#define arch_clear_bit_unlock_is_negative_byte(nr, addr) \ - (clear_bit_unlock_return_word(nr, addr) & BIT_MASK(7)) +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte #endif /* CONFIG_PPC64 */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 2edf68475fec..f03c0a50ec3a 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -94,18 +94,17 @@ arch___clear_bit(unsigned long nr, volatile unsigned long *addr) asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } -static __always_inline bool -arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *addr) { bool negative; - asm volatile(LOCK_PREFIX "andb %2,%1" + asm volatile(LOCK_PREFIX "xorb %2,%1" CC_SET(s) : CC_OUT(s) (negative), WBYTE_ADDR(addr) - : "ir" ((char) ~(1 << nr)) : "memory"); + : "iq" ((char)mask) : "memory"); return negative; } -#define arch_clear_bit_unlock_is_negative_byte \ - arch_clear_bit_unlock_is_negative_byte +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte static __always_inline void arch___clear_bit_unlock(long nr, volatile unsigned long *addr) diff --git a/include/asm-generic/bitops/instrumented-lock.h b/include/asm-generic/bitops/instrumented-lock.h index eb64bd4f11f3..e8ea3aeda9a9 100644 --- a/include/asm-generic/bitops/instrumented-lock.h +++ b/include/asm-generic/bitops/instrumented-lock.h @@ -58,27 +58,30 @@ static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr) return arch_test_and_set_bit_lock(nr, addr); } -#if defined(arch_clear_bit_unlock_is_negative_byte) +#if defined(arch_xor_unlock_is_negative_byte) /** - * clear_bit_unlock_is_negative_byte - Clear a bit in memory and test if bottom - * byte is negative, for unlock. - * @nr: the bit to clear - * @addr: the address to start counting from + * xor_unlock_is_negative_byte - XOR a single byte in memory and test if + * it is negative, for unlock. + * @mask: Change the bits which are set in this mask. + * @addr: The address of the word containing the byte to change. * + * Changes some of bits 0-6 in the word pointed to by @addr. * This operation is atomic and provides release barrier semantics. + * Used to optimise some folio operations which are commonly paired + * with an unlock or end of writeback. Bit 7 is used as PG_waiters to + * indicate whether anybody is waiting for the unlock. * - * This is a bit of a one-trick-pony for the filemap code, which clears - * PG_locked and tests PG_waiters, + * Return: Whether the top bit of the byte is set. */ -static inline bool -clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +static inline bool xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *addr) { kcsan_release(); - instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); - return arch_clear_bit_unlock_is_negative_byte(nr, addr); + instrument_atomic_write(addr, sizeof(long)); + return arch_xor_unlock_is_negative_byte(mask, addr); } /* Let everybody know we have it. */ -#define clear_bit_unlock_is_negative_byte clear_bit_unlock_is_negative_byte +#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte #endif #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H */ diff --git a/include/asm-generic/bitops/lock.h b/include/asm-generic/bitops/lock.h index 40913516e654..6a638e89d130 100644 --- a/include/asm-generic/bitops/lock.h +++ b/include/asm-generic/bitops/lock.h @@ -66,27 +66,16 @@ arch___clear_bit_unlock(unsigned int nr, volatile unsigned long *p) raw_atomic_long_set_release((atomic_long_t *)p, old); } -/** - * arch_clear_bit_unlock_is_negative_byte - Clear a bit in memory and test if bottom - * byte is negative, for unlock. - * @nr: the bit to clear - * @addr: the address to start counting from - * - * This is a bit of a one-trick-pony for the filemap code, which clears - * PG_locked and tests PG_waiters, - */ -#ifndef arch_clear_bit_unlock_is_negative_byte -static inline bool arch_clear_bit_unlock_is_negative_byte(unsigned int nr, - volatile unsigned long *p) +#ifndef arch_xor_unlock_is_negative_byte +static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *p) { long old; - unsigned long mask = BIT_MASK(nr); - p += BIT_WORD(nr); - old = raw_atomic_long_fetch_andnot_release(mask, (atomic_long_t *)p); + old = raw_atomic_long_fetch_xor_release(mask, (atomic_long_t *)p); return !!(old & BIT(7)); } -#define arch_clear_bit_unlock_is_negative_byte arch_clear_bit_unlock_is_negative_byte +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte #endif #include diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 0ddbdab5903d..1333d23ac4ef 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -700,10 +700,10 @@ static void test_barrier_nothreads(struct kunit *test) KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false); KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true); -#ifdef clear_bit_unlock_is_negative_byte - KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); - KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); - KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); +#ifdef xor_unlock_is_negative_byte + KCSAN_EXPECT_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); #endif kcsan_nestable_atomic_end(); } diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 8679322450f2..619be7417420 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -228,10 +228,10 @@ static bool __init test_barrier(void) spin_lock(&test_spinlock); KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock)); -#ifdef clear_bit_unlock_is_negative_byte - KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); - KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); - KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); +#ifdef xor_unlock_is_negative_byte + KCSAN_CHECK_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); + KCSAN_CHECK_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); #endif kcsan_nestable_atomic_end(); diff --git a/mm/filemap.c b/mm/filemap.c index 1ce78c619294..c637863f4643 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1482,6 +1482,11 @@ void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) } EXPORT_SYMBOL_GPL(folio_add_wait_queue); +#ifdef xor_unlock_is_negative_byte +#define clear_bit_unlock_is_negative_byte(nr, p) \ + xor_unlock_is_negative_byte(1 << nr, p) +#endif + #ifndef clear_bit_unlock_is_negative_byte /* diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 2030c7ff7de9..821c9ea0473a 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -1099,9 +1099,10 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr) KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr)); -#if defined(clear_bit_unlock_is_negative_byte) - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = - clear_bit_unlock_is_negative_byte(nr, addr)); +#if defined(xor_unlock_is_negative_byte) + if (nr < 7) + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = + xor_unlock_is_negative_byte(1 << nr, addr)); #endif } -- cgit v1.2.3 From f12fb73b74fd23ca33e3f95fb996f295eeae1da7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:14 +0100 Subject: mm: delete checks for xor_unlock_is_negative_byte() Architectures which don't define their own use the one in asm-generic/bitops/lock.h. Get rid of all the ifdefs around "maybe we don't have it". Link: https://lkml.kernel.org/r/20231004165317.1061855-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Geert Uytterhoeven Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/alpha/include/asm/bitops.h | 1 - arch/m68k/include/asm/bitops.h | 1 - arch/mips/include/asm/bitops.h | 1 - arch/riscv/include/asm/bitops.h | 1 - include/asm-generic/bitops/instrumented-lock.h | 5 ----- include/asm-generic/bitops/lock.h | 1 - kernel/kcsan/kcsan_test.c | 3 --- kernel/kcsan/selftest.c | 3 --- mm/filemap.c | 30 +------------------------- mm/kasan/kasan_test.c | 3 --- 10 files changed, 1 insertion(+), 48 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h index b50ad6b83e85..3e33621922c3 100644 --- a/arch/alpha/include/asm/bitops.h +++ b/arch/alpha/include/asm/bitops.h @@ -305,7 +305,6 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask, return (old & BIT(7)) != 0; } -#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte /* * ffz = Find First Zero in word. Undefined if no zero exists, diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index 80ee36095905..14c64a6f1217 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -339,7 +339,6 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask, return result; #endif } -#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte /* * The true 68020 and more advanced processors support the "bfffo" diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index d98a05c478f4..89f73d1a4ea4 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -301,7 +301,6 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask, return res; } -#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte #undef __bit_op #undef __test_bit_op diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 15e3044298a2..65f6eee4ab8d 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -202,7 +202,6 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask, : "memory"); return (res & BIT(7)) != 0; } -#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte #undef __test_and_op_bit #undef __op_bit diff --git a/include/asm-generic/bitops/instrumented-lock.h b/include/asm-generic/bitops/instrumented-lock.h index e8ea3aeda9a9..542d3727ee4e 100644 --- a/include/asm-generic/bitops/instrumented-lock.h +++ b/include/asm-generic/bitops/instrumented-lock.h @@ -58,7 +58,6 @@ static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr) return arch_test_and_set_bit_lock(nr, addr); } -#if defined(arch_xor_unlock_is_negative_byte) /** * xor_unlock_is_negative_byte - XOR a single byte in memory and test if * it is negative, for unlock. @@ -80,8 +79,4 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask, instrument_atomic_write(addr, sizeof(long)); return arch_xor_unlock_is_negative_byte(mask, addr); } -/* Let everybody know we have it. */ -#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte -#endif - #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H */ diff --git a/include/asm-generic/bitops/lock.h b/include/asm-generic/bitops/lock.h index 6a638e89d130..14d4ec8c5152 100644 --- a/include/asm-generic/bitops/lock.h +++ b/include/asm-generic/bitops/lock.h @@ -75,7 +75,6 @@ static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, old = raw_atomic_long_fetch_xor_release(mask, (atomic_long_t *)p); return !!(old & BIT(7)); } -#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte #endif #include diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 1333d23ac4ef..015586217875 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -699,12 +699,9 @@ static void test_barrier_nothreads(struct kunit *test) KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true); KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false); KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true); - -#ifdef xor_unlock_is_negative_byte KCSAN_EXPECT_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); KCSAN_EXPECT_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); KCSAN_EXPECT_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); -#endif kcsan_nestable_atomic_end(); } diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 619be7417420..84a1200271af 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -227,12 +227,9 @@ static bool __init test_barrier(void) KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock)); spin_lock(&test_spinlock); KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock)); - -#ifdef xor_unlock_is_negative_byte KCSAN_CHECK_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); KCSAN_CHECK_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); KCSAN_CHECK_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); -#endif kcsan_nestable_atomic_end(); return ret; diff --git a/mm/filemap.c b/mm/filemap.c index c637863f4643..458377e9a184 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1482,34 +1482,6 @@ void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) } EXPORT_SYMBOL_GPL(folio_add_wait_queue); -#ifdef xor_unlock_is_negative_byte -#define clear_bit_unlock_is_negative_byte(nr, p) \ - xor_unlock_is_negative_byte(1 << nr, p) -#endif - -#ifndef clear_bit_unlock_is_negative_byte - -/* - * PG_waiters is the high bit in the same byte as PG_lock. - * - * On x86 (and on many other architectures), we can clear PG_lock and - * test the sign bit at the same time. But if the architecture does - * not support that special operation, we just do this all by hand - * instead. - * - * The read of PG_waiters has to be after (or concurrently with) PG_locked - * being cleared, but a memory barrier should be unnecessary since it is - * in the same byte as PG_locked. - */ -static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) -{ - clear_bit_unlock(nr, mem); - /* smp_mb__after_atomic(); */ - return test_bit(PG_waiters, mem); -} - -#endif - /** * folio_unlock - Unlock a locked folio. * @folio: The folio. @@ -1525,7 +1497,7 @@ void folio_unlock(struct folio *folio) BUILD_BUG_ON(PG_waiters != 7); BUILD_BUG_ON(PG_locked > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0))) + if (xor_unlock_is_negative_byte(1 << PG_locked, folio_flags(folio, 0))) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_unlock); diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 821c9ea0473a..8281eb42464b 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -1098,12 +1098,9 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr) KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr)); - -#if defined(xor_unlock_is_negative_byte) if (nr < 7) KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = xor_unlock_is_negative_byte(1 << nr, addr)); -#endif } static void kasan_bitops_generic(struct kunit *test) -- cgit v1.2.3 From 8cba9576df601c384abd334a503c3f6e1e29eefb Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:28 -0700 Subject: hugetlb: memcg: account hugetlb-backed memory in memory controller Currently, hugetlb memory usage is not acounted for in the memory controller, which could lead to memory overprotection for cgroups with hugetlb-backed memory. This has been observed in our production system. For instance, here is one of our usecases: suppose there are two 32G containers. The machine is booted with hugetlb_cma=6G, and each container may or may not use up to 3 gigantic page, depending on the workload within it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup limit of each cgroup to 3G to enforce hugetlb fairness. But it is very difficult to configure memory.max to keep overall consumption, including anon, cache, slab etc. fair. What we have had to resort to is to constantly poll hugetlb usage and readjust memory.max. Similar procedure is done to other memory limits (memory.low for e.g). However, this is rather cumbersome and buggy. Furthermore, when there is a delay in memory limits correction, (for e.g when hugetlb usage changes within consecutive runs of the userspace agent), the system could be in an over/underprotected state. This patch rectifies this issue by charging the memcg when the hugetlb folio is utilized, and uncharging when the folio is freed (analogous to the hugetlb controller). Note that we do not charge when the folio is allocated to the hugetlb pool, because at this point it is not owned by any memcg. Some caveats to consider: * This feature is only available on cgroup v2. * There is no hugetlb pool management involved in the memory controller. As stated above, hugetlb folios are only charged towards the memory controller when it is used. Host overcommit management has to consider it when configuring hard limits. * Failure to charge towards the memcg results in SIGBUS. This could happen even if the hugetlb pool still has pages (but the cgroup limit is hit and reclaim attempt fails). * When this feature is enabled, hugetlb pages contribute to memory reclaim protection. low, min limits tuning must take into account hugetlb memory. * Hugetlb pages utilized while this option is not selected will not be tracked by the memory controller (even if cgroup v2 is remounted later on). Link: https://lkml.kernel.org/r/20231006184629.155543-4-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Cc: Frank van der Linden Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 29 +++++++++++++++++++++++ include/linux/cgroup-defs.h | 5 ++++ include/linux/memcontrol.h | 9 +++++++ kernel/cgroup/cgroup.c | 15 +++++++++++- mm/hugetlb.c | 35 +++++++++++++++++++++------ mm/memcontrol.c | 42 ++++++++++++++++++++++++++++++++- mm/migrate.c | 3 +-- 7 files changed, 127 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 622a7f28db1f..606b2e0eac4b 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -210,6 +210,35 @@ cgroup v2 currently supports the following mount options. relying on the original semantics (e.g. specifying bogusly high 'bypass' protection values at higher tree levels). + memory_hugetlb_accounting + Count HugeTLB memory usage towards the cgroup's overall + memory usage for the memory controller (for the purpose of + statistics reporting and memory protetion). This is a new + behavior that could regress existing setups, so it must be + explicitly opted in with this mount option. + + A few caveats to keep in mind: + + * There is no HugeTLB pool management involved in the memory + controller. The pre-allocated pool does not belong to anyone. + Specifically, when a new HugeTLB folio is allocated to + the pool, it is not accounted for from the perspective of the + memory controller. It is only charged to a cgroup when it is + actually used (for e.g at page fault time). Host memory + overcommit management has to consider this when configuring + hard limits. In general, HugeTLB pool management should be + done via other mechanisms (such as the HugeTLB controller). + * Failure to charge a HugeTLB folio to the memory controller + results in SIGBUS. This could happen even if the HugeTLB pool + still has pages available (but the cgroup limit is hit and + reclaim attempt fails). + * Charging HugeTLB memory towards the memory controller affects + memory protection and reclaim dynamics. Any userspace tuning + (of low, min limits for e.g) needs to take this into account. + * HugeTLB pages utilized while this option is not selected + will not be tracked by the memory controller (even if cgroup + v2 is remounted later on). + Organizing Processes and Threads -------------------------------- diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index f1b3151ac30b..8641f4320c98 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -115,6 +115,11 @@ enum { * Enable recursive subtree protection */ CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18), + + /* + * Enable hugetlb accounting for the memory controller. + */ + CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), }; /* cftype->flags */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a3fce570122e..6674c12725d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -678,6 +678,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } +int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, + long nr_pages); + int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); @@ -1258,6 +1261,12 @@ static inline int mem_cgroup_charge(struct folio *folio, return 0; } +static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, + gfp_t gfp, long nr_pages) +{ + return 0; +} + static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1fb7f562289d..f11488b18ceb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1902,6 +1902,7 @@ enum cgroup2_param { Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, + Opt_memory_hugetlb_accounting, nr__cgroup2_params }; @@ -1910,6 +1911,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), + fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), {} }; @@ -1936,6 +1938,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; + case Opt_memory_hugetlb_accounting: + ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + return 0; } return -EINVAL; } @@ -1960,6 +1965,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; + + if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; } } @@ -1973,6 +1983,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) seq_puts(seq, ",memory_recursiveprot"); + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) + seq_puts(seq, ",memory_hugetlb_accounting"); return 0; } @@ -7050,7 +7062,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, "nsdelegate\n" "favordynmods\n" "memory_localevents\n" - "memory_recursiveprot\n"); + "memory_recursiveprot\n" + "memory_hugetlb_accounting\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e2b1c417b90a..da6f85b7db88 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1927,6 +1927,7 @@ void free_huge_folio(struct folio *folio) pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); + mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; @@ -3026,11 +3027,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit; + long map_chg, map_commit, nr_pages = pages_per_huge_page(h); long gbl_chg; - int ret, idx; + int memcg_charge_ret, ret, idx; struct hugetlb_cgroup *h_cg = NULL; + struct mem_cgroup *memcg; bool deferred_reserve; + gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; + + memcg = get_mem_cgroup_from_current(); + memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); + if (memcg_charge_ret == -ENOMEM) { + mem_cgroup_put(memcg); + return ERR_PTR(-ENOMEM); + } idx = hstate_index(h); /* @@ -3039,8 +3049,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * code of zero indicates a reservation exists (no change). */ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) + if (map_chg < 0) { + if (!memcg_charge_ret) + mem_cgroup_cancel_charge(memcg, nr_pages); + mem_cgroup_put(memcg); return ERR_PTR(-ENOMEM); + } /* * Processes that did not create the mapping will have no @@ -3051,10 +3065,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg || avoid_reserve) { gbl_chg = hugepage_subpool_get_pages(spool, 1); - if (gbl_chg < 0) { - vma_end_reservation(h, vma, addr); - return ERR_PTR(-ENOSPC); - } + if (gbl_chg < 0) + goto out_end_reservation; /* * Even though there was no reservation in the region/reserve @@ -3136,6 +3148,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } + + if (!memcg_charge_ret) + mem_cgroup_commit_charge(folio, memcg); + mem_cgroup_put(memcg); + return folio; out_uncharge_cgroup: @@ -3147,7 +3164,11 @@ out_uncharge_cgroup_reservation: out_subpool_put: if (map_chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); +out_end_reservation: vma_end_reservation(h, vma, addr); + if (!memcg_charge_ret) + mem_cgroup_cancel_charge(memcg, nr_pages); + mem_cgroup_put(memcg); return ERR_PTR(-ENOSPC); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 135d6637afe5..a86e7b445800 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7096,6 +7096,41 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) return ret; } +/** + * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio + * @memcg: memcg to charge. + * @gfp: reclaim mode. + * @nr_pages: number of pages to charge. + * + * This function is called when allocating a huge page folio to determine if + * the memcg has the capacity for it. It does not commit the charge yet, + * as the hugetlb folio itself has not been obtained from the hugetlb pool. + * + * Once we have obtained the hugetlb folio, we can call + * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the + * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect + * of try_charge(). + * + * Returns 0 on success. Otherwise, an error code is returned. + */ +int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, + long nr_pages) +{ + /* + * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation, + * but do not attempt to commit charge later (or cancel on error) either. + */ + if (mem_cgroup_disabled() || !memcg || + !cgroup_subsys_on_dfl(memory_cgrp_subsys) || + !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) + return -EOPNOTSUPP; + + if (try_charge(memcg, gfp, nr_pages)) + return -ENOMEM; + + return 0; +} + /** * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. * @folio: folio to charge. @@ -7365,7 +7400,12 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) return; memcg = folio_memcg(old); - VM_WARN_ON_ONCE_FOLIO(!memcg, old); + /* + * Note that it is normal to see !memcg for a hugetlb folio. + * For e.g, itt could have been allocated when memory_hugetlb_accounting + * was not selected. + */ + VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old); if (!memcg) return; diff --git a/mm/migrate.c b/mm/migrate.c index 9d9eee5322d1..c602bf6dec97 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -633,8 +633,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_copy_owner(newfolio, folio); - if (!folio_test_hugetlb(folio)) - mem_cgroup_migrate(folio, newfolio); + mem_cgroup_migrate(folio, newfolio); } EXPORT_SYMBOL(folio_migrate_flags); -- cgit v1.2.3 From 37acade0ce8938f00d6979bd02b8043b5b7089ae Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 10 Oct 2023 04:58:29 +0100 Subject: sched: remove wait bookmarks There are no users of wait bookmarks left, so simplify the wait code by removing them. Link: https://lkml.kernel.org/r/20231010035829.544242-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ingo Molnar Cc: Benjamin Segall Cc: Bin Lai Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt (Google) Cc: Valentin Schneider Cc: Vincent Guittot Signed-off-by: Andrew Morton --- include/linux/wait.h | 9 +++----- kernel/sched/wait.c | 60 +++++++++------------------------------------------- 2 files changed, 13 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 5ec7739400f4..3473b663176f 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -19,10 +19,9 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int /* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 #define WQ_FLAG_WOKEN 0x02 -#define WQ_FLAG_BOOKMARK 0x04 -#define WQ_FLAG_CUSTOM 0x08 -#define WQ_FLAG_DONE 0x10 -#define WQ_FLAG_PRIORITY 0x20 +#define WQ_FLAG_CUSTOM 0x04 +#define WQ_FLAG_DONE 0x08 +#define WQ_FLAG_PRIORITY 0x10 /* * A single wait-queue entry structure: @@ -212,8 +211,6 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); -void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, - unsigned int mode, void *key, wait_queue_entry_t *bookmark); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 802d98cf2de3..51e38f5f4701 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -57,13 +57,6 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry } EXPORT_SYMBOL(remove_wait_queue); -/* - * Scan threshold to break wait queue walk. - * This allows a waker to take a break from holding the - * wait queue lock during the wait queue walk. - */ -#define WAITQUEUE_WALK_BREAK_CNT 64 - /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve @@ -78,21 +71,13 @@ EXPORT_SYMBOL(remove_wait_queue); * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, int wake_flags, void *key, - wait_queue_entry_t *bookmark) + int nr_exclusive, int wake_flags, void *key) { wait_queue_entry_t *curr, *next; - int cnt = 0; lockdep_assert_held(&wq_head->lock); - if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { - curr = list_next_entry(bookmark, entry); - - list_del(&bookmark->entry); - bookmark->flags = 0; - } else - curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); + curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); if (&curr->entry == &wq_head->head) return nr_exclusive; @@ -101,21 +86,11 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, unsigned flags = curr->flags; int ret; - if (flags & WQ_FLAG_BOOKMARK) - continue; - ret = curr->func(curr, mode, wake_flags, key); if (ret < 0) break; if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; - - if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) && - (&next->entry != &wq_head->head)) { - bookmark->flags = WQ_FLAG_BOOKMARK; - list_add_tail(&bookmark->entry, &next->entry); - break; - } } return nr_exclusive; @@ -125,20 +100,12 @@ static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m int nr_exclusive, int wake_flags, void *key) { unsigned long flags; - wait_queue_entry_t bookmark; - int remaining = nr_exclusive; + int remaining; - bookmark.flags = 0; - bookmark.private = NULL; - bookmark.func = NULL; - INIT_LIST_HEAD(&bookmark.entry); - - do { - spin_lock_irqsave(&wq_head->lock, flags); - remaining = __wake_up_common(wq_head, mode, remaining, - wake_flags, key, &bookmark); - spin_unlock_irqrestore(&wq_head->lock, flags); - } while (bookmark.flags & WQ_FLAG_BOOKMARK); + spin_lock_irqsave(&wq_head->lock, flags); + remaining = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, + key); + spin_unlock_irqrestore(&wq_head->lock, flags); return nr_exclusive - remaining; } @@ -171,23 +138,16 @@ void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode */ void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) { - __wake_up_common(wq_head, mode, nr, 0, NULL, NULL); + __wake_up_common(wq_head, mode, nr, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(wq_head, mode, 1, 0, key, NULL); + __wake_up_common(wq_head, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); -void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, - unsigned int mode, void *key, wait_queue_entry_t *bookmark) -{ - __wake_up_common(wq_head, mode, 1, 0, key, bookmark); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); - /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue @@ -233,7 +193,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key); void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL); + __wake_up_common(wq_head, mode, 1, WF_SYNC, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key); -- cgit v1.2.3 From e8e17ee90eaf650c855adb0a3e5e965fd6692ff1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Oct 2023 18:04:28 +0100 Subject: mm: drop the assumption that VM_SHARED always implies writable Patch series "permit write-sealed memfd read-only shared mappings", v4. The man page for fcntl() describing memfd file seals states the following about F_SEAL_WRITE:- Furthermore, trying to create new shared, writable memory-mappings via mmap(2) will also fail with EPERM. With emphasis on 'writable'. In turns out in fact that currently the kernel simply disallows all new shared memory mappings for a memfd with F_SEAL_WRITE applied, rendering this documentation inaccurate. This matters because users are therefore unable to obtain a shared mapping to a memfd after write sealing altogether, which limits their usefulness. This was reported in the discussion thread [1] originating from a bug report [2]. This is a product of both using the struct address_space->i_mmap_writable atomic counter to determine whether writing may be permitted, and the kernel adjusting this counter when any VM_SHARED mapping is performed and more generally implicitly assuming VM_SHARED implies writable. It seems sensible that we should only update this mapping if VM_MAYWRITE is specified, i.e. whether it is possible that this mapping could at any point be written to. If we do so then all we need to do to permit write seals to function as documented is to clear VM_MAYWRITE when mapping read-only. It turns out this functionality already exists for F_SEAL_FUTURE_WRITE - we can therefore simply adapt this logic to do the same for F_SEAL_WRITE. We then hit a chicken and egg situation in mmap_region() where the check for VM_MAYWRITE occurs before we are able to clear this flag. To work around this, perform this check after we invoke call_mmap(), with careful consideration of error paths. Thanks to Andy Lutomirski for the suggestion! [1]:https://lore.kernel.org/all/20230324133646.16101dfa666f253c4715d965@linux-foundation.org/ [2]:https://bugzilla.kernel.org/show_bug.cgi?id=217238 This patch (of 3): There is a general assumption that VMAs with the VM_SHARED flag set are writable. If the VM_MAYWRITE flag is not set, then this is simply not the case. Update those checks which affect the struct address_space->i_mmap_writable field to explicitly test for this by introducing [vma_]is_shared_maywrite() helper functions. This remains entirely conservative, as the lack of VM_MAYWRITE guarantees that the VMA cannot be written to. Link: https://lkml.kernel.org/r/cover.1697116581.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/d978aefefa83ec42d18dfa964ad180dbcde34795.1697116581.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Suggested-by: Andy Lutomirski Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/fs.h | 4 ++-- include/linux/mm.h | 11 +++++++++++ kernel/fork.c | 2 +- mm/filemap.c | 2 +- mm/madvise.c | 2 +- mm/mmap.c | 12 ++++++------ 6 files changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/fs.h b/include/linux/fs.h index fd539c9fef8e..5265186da0e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -454,7 +454,7 @@ extern const struct address_space_operations empty_aops; * It is also used to block modification of page cache contents through * memory mappings. * @gfp_mask: Memory allocation flags to use for allocating pages. - * @i_mmap_writable: Number of VM_SHARED mappings. + * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings. * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. @@ -557,7 +557,7 @@ static inline int mapping_mapped(struct address_space *mapping) /* * Might pages of this file have been modified in userspace? - * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap + * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. * diff --git a/include/linux/mm.h b/include/linux/mm.h index 1bddb151cd5c..c3b5749ede9d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -937,6 +937,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } +static inline bool is_shared_maywrite(vm_flags_t vm_flags) +{ + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); +} + +static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +{ + return is_shared_maywrite(vma->vm_flags); +} + static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { diff --git a/kernel/fork.c b/kernel/fork.c index e45a4457ba83..1e6c656e0857 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -733,7 +733,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, get_file(file); i_mmap_lock_write(mapping); - if (tmp->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(tmp)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ diff --git a/mm/filemap.c b/mm/filemap.c index 9ef49255f1a5..9710f43a89ac 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3618,7 +3618,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) */ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + if (vma_is_shared_maywrite(vma)) return -EINVAL; return generic_file_mmap(file, vma); } diff --git a/mm/madvise.c b/mm/madvise.c index a6b48d4796ba..cf4d694280e9 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -970,7 +970,7 @@ static long madvise_remove(struct vm_area_struct *vma, return -EINVAL; } - if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + if (!vma_is_shared_maywrite(vma)) return -EACCES; offset = (loff_t)(start - vma->vm_start) diff --git a/mm/mmap.c b/mm/mmap.c index 3ea52451623b..0041e3631f6c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -107,7 +107,7 @@ void vma_set_page_prot(struct vm_area_struct *vma) static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { - if (vma->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); @@ -384,7 +384,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, static void __vma_link_file(struct vm_area_struct *vma, struct address_space *mapping) { - if (vma->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(vma)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); @@ -2846,7 +2846,7 @@ cannot_expand: vma->vm_pgoff = pgoff; if (file) { - if (vm_flags & VM_SHARED) { + if (is_shared_maywrite(vm_flags)) { error = mapping_map_writable(file->f_mapping); if (error) goto free_vma; @@ -2920,7 +2920,7 @@ cannot_expand: mm->map_count++; if (vma->vm_file) { i_mmap_lock_write(vma->vm_file->f_mapping); - if (vma->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(vma)) mapping_allow_writable(vma->vm_file->f_mapping); flush_dcache_mmap_lock(vma->vm_file->f_mapping); @@ -2937,7 +2937,7 @@ cannot_expand: /* Once vma denies write, undo our temporary denial count */ unmap_writable: - if (file && vm_flags & VM_SHARED) + if (file && is_shared_maywrite(vm_flags)) mapping_unmap_writable(file->f_mapping); file = vma->vm_file; ksm_add_vma(vma); @@ -2985,7 +2985,7 @@ unmap_and_free_vma: unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, vma->vm_end, vma->vm_end, true); } - if (file && (vm_flags & VM_SHARED)) + if (file && is_shared_maywrite(vm_flags)) mapping_unmap_writable(file->f_mapping); free_vma: vm_area_free(vma); -- cgit v1.2.3 From 0b201c3624ae9f58ebfff8484f304f3008fb01b8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:55 +0800 Subject: sched/fair: use folio_xchg_access_time() in numa_hint_fault_latency() Convert to use folio_xchg_access_time() in numa_hint_fault_latency(). Link: https://lkml.kernel.org/r/20231018140806.2783514-9-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42aefe7e6fdc..3ee9d3564c20 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1727,7 +1727,7 @@ static int numa_hint_fault_latency(struct folio *folio) int last_time, time; time = jiffies_to_msecs(jiffies); - last_time = xchg_page_access_time(&folio->page, time); + last_time = folio_xchg_access_time(folio, time); return (time - last_time) & PAGE_ACCESS_TIME_MASK; } -- cgit v1.2.3 From 1b143cc77f2074dd43b610d6bfffc822d20b6e16 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:00 +0800 Subject: sched/fair: use folio_xchg_last_cpupid() in should_numa_migrate_memory() Convert to use folio_xchg_last_cpupid() in should_numa_migrate_memory(). Link: https://lkml.kernel.org/r/20231018140806.2783514-14-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3ee9d3564c20..d1a765cdf6e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1823,7 +1823,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); - last_cpupid = page_cpupid_xchg_last(&folio->page, this_cpupid); + last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid); if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) -- cgit v1.2.3