diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-04-19 17:55:45 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-04-19 17:55:45 -0700 |
commit | cb0856346a60fe3eb837ba5e73588a41f81ac05f (patch) | |
tree | 0445b08a97f1a3d9791095cfad1e021ba6b228bc | |
parent | 23990b1affd2dc8f5e59048d4d4bef05f6e1c544 (diff) | |
parent | ef832747a82dfbc22a3702219cc716f449b24e4a (diff) | |
download | lwn-cb0856346a60fe3eb837ba5e73588a41f81ac05f.tar.gz lwn-cb0856346a60fe3eb837ba5e73588a41f81ac05f.zip |
Merge tag 'mm-hotfixes-stable-2023-04-19-16-36' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull misc fixes from Andrew Morton:
"22 hotfixes.
19 are cc:stable and the remainder address issues which were
introduced during this merge cycle, or aren't considered suitable for
-stable backporting.
19 are for MM and the remainder are for other subsystems"
* tag 'mm-hotfixes-stable-2023-04-19-16-36' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (22 commits)
nilfs2: initialize unused bytes in segment summary blocks
mm: page_alloc: skip regions with hugetlbfs pages when allocating 1G pages
mm/mmap: regression fix for unmapped_area{_topdown}
maple_tree: fix mas_empty_area() search
maple_tree: make maple state reusable after mas_empty_area_rev()
mm: kmsan: handle alloc failures in kmsan_ioremap_page_range()
mm: kmsan: handle alloc failures in kmsan_vmap_pages_range_noflush()
tools/Makefile: do missed s/vm/mm/
mm: fix memory leak on mm_init error handling
mm/page_alloc: fix potential deadlock on zonelist_update_seq seqlock
kernel/sys.c: fix and improve control flow in __sys_setres[ug]id()
Revert "userfaultfd: don't fail on unrecognized features"
writeback, cgroup: fix null-ptr-deref write in bdi_split_work_to_wbs
maple_tree: fix a potential memory leak, OOB access, or other unpredictable bug
tools/mm/page_owner_sort.c: fix TGID output when cull=tg is used
mailmap: update jtoppins' entry to reference correct email
mm/mempolicy: fix use-after-free of VMA iterator
mm/huge_memory.c: warn with pr_warn_ratelimited instead of VM_WARN_ON_ONCE_FOLIO
mm/mprotect: fix do_mprotect_pkey() return on error
mm/khugepaged: check again on anon uffd-wp during isolation
...
-rw-r--r-- | .mailmap | 2 | ||||
-rw-r--r-- | fs/fs-writeback.c | 17 | ||||
-rw-r--r-- | fs/nilfs2/segment.c | 20 | ||||
-rw-r--r-- | fs/userfaultfd.c | 6 | ||||
-rw-r--r-- | include/linux/kmsan.h | 39 | ||||
-rw-r--r-- | kernel/fork.c | 1 | ||||
-rw-r--r-- | kernel/sys.c | 69 | ||||
-rw-r--r-- | lib/maple_tree.c | 66 | ||||
-rw-r--r-- | mm/backing-dev.c | 12 | ||||
-rw-r--r-- | mm/huge_memory.c | 19 | ||||
-rw-r--r-- | mm/khugepaged.c | 4 | ||||
-rw-r--r-- | mm/kmsan/hooks.c | 55 | ||||
-rw-r--r-- | mm/kmsan/shadow.c | 27 | ||||
-rw-r--r-- | mm/mempolicy.c | 104 | ||||
-rw-r--r-- | mm/mmap.c | 48 | ||||
-rw-r--r-- | mm/mprotect.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 19 | ||||
-rw-r--r-- | mm/swap.c | 2 | ||||
-rw-r--r-- | mm/vmalloc.c | 10 | ||||
-rw-r--r-- | tools/Makefile | 14 | ||||
-rw-r--r-- | tools/mm/page_owner_sort.c | 2 |
21 files changed, 351 insertions, 187 deletions
@@ -232,6 +232,8 @@ Johan Hovold <johan@kernel.org> <johan@hovoldconsulting.com> John Crispin <john@phrozen.org> <blogic@openwrt.org> John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> John Stultz <johnstul@us.ibm.com> +<jon.toppins+linux@gmail.com> <jtoppins@cumulusnetworks.com> +<jon.toppins+linux@gmail.com> <jtoppins@redhat.com> Jordan Crouse <jordan@cosmicpenguin.net> <jcrouse@codeaurora.org> <josh@joshtriplett.org> <josh@freedesktop.org> <josh@joshtriplett.org> <josh@kernel.org> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 195dc23e0d83..1db3e3c24b43 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -978,6 +978,16 @@ restart: continue; } + /* + * If wb_tryget fails, the wb has been shutdown, skip it. + * + * Pin @wb so that it stays on @bdi->wb_list. This allows + * continuing iteration from @wb after dropping and + * regrabbing rcu read lock. + */ + if (!wb_tryget(wb)) + continue; + /* alloc failed, execute synchronously using on-stack fallback */ work = &fallback_work; *work = *base_work; @@ -986,13 +996,6 @@ restart: work->done = &fallback_work_done; wb_queue_work(wb, work); - - /* - * Pin @wb so that it stays on @bdi->wb_list. This allows - * continuing iteration from @wb after dropping and - * regrabbing rcu read lock. - */ - wb_get(wb); last_wb = wb; rcu_read_unlock(); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 6ad41390fa74..228659612c0d 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -430,6 +430,23 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) return 0; } +/** + * nilfs_segctor_zeropad_segsum - zero pad the rest of the segment summary area + * @sci: segment constructor object + * + * nilfs_segctor_zeropad_segsum() zero-fills unallocated space at the end of + * the current segment summary block. + */ +static void nilfs_segctor_zeropad_segsum(struct nilfs_sc_info *sci) +{ + struct nilfs_segsum_pointer *ssp; + + ssp = sci->sc_blk_cnt > 0 ? &sci->sc_binfo_ptr : &sci->sc_finfo_ptr; + if (ssp->offset < ssp->bh->b_size) + memset(ssp->bh->b_data + ssp->offset, 0, + ssp->bh->b_size - ssp->offset); +} + static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) { sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; @@ -438,6 +455,7 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) * The current segment is filled up * (internal code) */ + nilfs_segctor_zeropad_segsum(sci); sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); return nilfs_segctor_reset_segment_buffer(sci); } @@ -542,6 +560,7 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, goto retry; } if (unlikely(required)) { + nilfs_segctor_zeropad_segsum(sci); err = nilfs_segbuf_extend_segsum(segbuf); if (unlikely(err)) goto failed; @@ -1533,6 +1552,7 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); sci->sc_stage = prev_stage; } + nilfs_segctor_zeropad_segsum(sci); nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile); return 0; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 44d1ee429eb0..40f9e1a2ebdd 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1955,8 +1955,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - /* Ignore unsupported features (userspace built against newer kernel) */ - features = uffdio_api.features & UFFD_API_FEATURES; + features = uffdio_api.features; + ret = -EINVAL; + if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) + goto err_out; ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) goto err_out; diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index e38ae3c34618..30b17647ce3c 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -134,11 +134,12 @@ void kmsan_kfree_large(const void *ptr); * @page_shift: page_shift passed to vmap_range_noflush(). * * KMSAN maps shadow and origin pages of @pages into contiguous ranges in - * vmalloc metadata address range. + * vmalloc metadata address range. Returns 0 on success, callers must check + * for non-zero return value. */ -void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages, - unsigned int page_shift); +int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); /** * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. @@ -159,11 +160,12 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end); * @page_shift: page_shift argument passed to vmap_range_noflush(). * * KMSAN creates new metadata pages for the physical pages mapped into the - * virtual memory. + * virtual memory. Returns 0 on success, callers must check for non-zero return + * value. */ -void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int page_shift); +int kmsan_ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift); /** * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call. @@ -281,12 +283,13 @@ static inline void kmsan_kfree_large(const void *ptr) { } -static inline void kmsan_vmap_pages_range_noflush(unsigned long start, - unsigned long end, - pgprot_t prot, - struct page **pages, - unsigned int page_shift) +static inline int kmsan_vmap_pages_range_noflush(unsigned long start, + unsigned long end, + pgprot_t prot, + struct page **pages, + unsigned int page_shift) { + return 0; } static inline void kmsan_vunmap_range_noflush(unsigned long start, @@ -294,12 +297,12 @@ static inline void kmsan_vunmap_range_noflush(unsigned long start, { } -static inline void kmsan_ioremap_page_range(unsigned long start, - unsigned long end, - phys_addr_t phys_addr, - pgprot_t prot, - unsigned int page_shift) +static inline int kmsan_ioremap_page_range(unsigned long start, + unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) { + return 0; } static inline void kmsan_iounmap_page_range(unsigned long start, diff --git a/kernel/fork.c b/kernel/fork.c index 0c92f224c68c..ea332319dffe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1174,6 +1174,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, fail_pcpu: while (i > 0) percpu_counter_destroy(&mm->rss_stat[--i]); + destroy_context(mm); fail_nocontext: mm_free_pgd(mm); fail_nopgd: diff --git a/kernel/sys.c b/kernel/sys.c index 495cd87d9bf4..351de7916302 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -664,6 +664,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) struct cred *new; int retval; kuid_t kruid, keuid, ksuid; + bool ruid_new, euid_new, suid_new; kruid = make_kuid(ns, ruid); keuid = make_kuid(ns, euid); @@ -678,25 +679,29 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if ((suid != (uid_t) -1) && !uid_valid(ksuid)) return -EINVAL; + old = current_cred(); + + /* check for no-op */ + if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) && + (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) && + uid_eq(keuid, old->fsuid))) && + (suid == (uid_t) -1 || uid_eq(ksuid, old->suid))) + return 0; + + ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && + !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid); + euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && + !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid); + suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && + !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid); + if ((ruid_new || euid_new || suid_new) && + !ns_capable_setid(old->user_ns, CAP_SETUID)) + return -EPERM; + new = prepare_creds(); if (!new) return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (!ns_capable_setid(old->user_ns, CAP_SETUID)) { - if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && - !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) - goto error; - if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && - !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) - goto error; - if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && - !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) - goto error; - } - if (ruid != (uid_t) -1) { new->uid = kruid; if (!uid_eq(kruid, old->uid)) { @@ -761,6 +766,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) struct cred *new; int retval; kgid_t krgid, kegid, ksgid; + bool rgid_new, egid_new, sgid_new; krgid = make_kgid(ns, rgid); kegid = make_kgid(ns, egid); @@ -773,23 +779,28 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) return -EINVAL; + old = current_cred(); + + /* check for no-op */ + if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) && + (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) && + gid_eq(kegid, old->fsgid))) && + (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid))) + return 0; + + rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && + !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid); + egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && + !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid); + sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && + !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid); + if ((rgid_new || egid_new || sgid_new) && + !ns_capable_setid(old->user_ns, CAP_SETGID)) + return -EPERM; + new = prepare_creds(); if (!new) return -ENOMEM; - old = current_cred(); - - retval = -EPERM; - if (!ns_capable_setid(old->user_ns, CAP_SETGID)) { - if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && - !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) - goto error; - if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && - !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) - goto error; - if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && - !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) - goto error; - } if (rgid != (gid_t) -1) new->gid = krgid; diff --git a/lib/maple_tree.c b/lib/maple_tree.c index db60edb55f2f..1281a40d5735 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1303,26 +1303,21 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) node = mas->alloc; node->request_count = 0; while (requested) { - max_req = MAPLE_ALLOC_SLOTS; - if (node->node_count) { - unsigned int offset = node->node_count; - - slots = (void **)&node->slot[offset]; - max_req -= offset; - } else { - slots = (void **)&node->slot; - } - + max_req = MAPLE_ALLOC_SLOTS - node->node_count; + slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; + if (node->node_count == 0) { + node->slot[0]->node_count = 0; + node->slot[0]->request_count = 0; + } + node->node_count += count; allocated += count; node = node->slot[0]; - node->node_count = 0; - node->request_count = 0; requested -= count; } mas->alloc->total = allocated; @@ -4970,7 +4965,8 @@ not_found: * Return: True if found in a leaf, false otherwise. * */ -static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) +static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, + unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); @@ -5035,8 +5031,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) if (unlikely(ma_is_leaf(type))) { mas->offset = offset; - mas->min = min; - mas->max = min + gap - 1; + *gap_min = min; + *gap_max = min + gap - 1; return true; } @@ -5060,10 +5056,10 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; - unsigned char offset; - unsigned long *gaps; - unsigned long *pivots = ma_pivots(mas_mn(mas), type); - void __rcu **slots = ma_slots(mas_mn(mas), type); + unsigned char offset, data_end; + unsigned long *gaps, *pivots; + void __rcu **slots; + struct maple_node *node; bool found = false; if (ma_is_dense(type)) { @@ -5071,13 +5067,15 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) return true; } - gaps = ma_gaps(mte_to_node(mas->node), type); + node = mas_mn(mas); + pivots = ma_pivots(node, type); + slots = ma_slots(node, type); + gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); - for (; offset < mt_slots[type]; offset++) { - pivot = mas_safe_pivot(mas, pivots, offset, type); - if (offset && !pivot) - break; + data_end = ma_data_end(node, type, pivots, mas->max); + for (; offset <= data_end; offset++) { + pivot = mas_logical_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) @@ -5312,6 +5310,9 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long *pivots; enum maple_type mt; + if (min >= max) + return -EINVAL; + if (mas_is_start(mas)) mas_start(mas); else if (mas->offset >= 2) @@ -5366,6 +5367,9 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, { struct maple_enode *last = mas->node; + if (min >= max) + return -EINVAL; + if (mas_is_start(mas)) { mas_start(mas); mas->offset = mas_data_end(mas); @@ -5385,7 +5389,7 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, mas->index = min; mas->last = max; - while (!mas_rev_awalk(mas, size)) { + while (!mas_rev_awalk(mas, size, &min, &max)) { if (last == mas->node) { if (!mas_rewind_node(mas)) return -EBUSY; @@ -5400,17 +5404,9 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) return -EBUSY; - /* - * mas_rev_awalk() has set mas->min and mas->max to the gap values. If - * the maximum is outside the window we are searching, then use the last - * location in the search. - * mas->max and mas->min is the range of the gap. - * mas->index and mas->last are currently set to the search range. - */ - /* Trim the upper limit to the max. */ - if (mas->max <= mas->last) - mas->last = mas->max; + if (max <= mas->last) + mas->last = max; mas->index = mas->last - size + 1; return 0; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a53b9360b72e..30d2d0386fdb 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -507,6 +507,15 @@ static LIST_HEAD(offline_cgwbs); static void cleanup_offline_cgwbs_workfn(struct work_struct *work); static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); +static void cgwb_free_rcu(struct rcu_head *rcu_head) +{ + struct bdi_writeback *wb = container_of(rcu_head, + struct bdi_writeback, rcu); + + percpu_ref_exit(&wb->refcnt); + kfree(wb); +} + static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, @@ -529,11 +538,10 @@ static void cgwb_release_workfn(struct work_struct *work) list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); - percpu_ref_exit(&wb->refcnt); wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); - kfree_rcu(wb, rcu); + call_rcu(&wb->rcu, cgwb_free_rcu); } static void cgwb_release(struct percpu_ref *refcnt) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 032fb0ef9cd1..3fae2d2496ab 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1838,10 +1838,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); struct page *page = pfn_swap_entry_to_page(entry); + pmd_t newpmd; VM_BUG_ON(!is_pmd_migration_entry(*pmd)); if (is_writable_migration_entry(entry)) { - pmd_t newpmd; /* * A protection check is difficult so * just be safe and disable write @@ -1855,8 +1855,16 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, newpmd = pmd_swp_mksoft_dirty(newpmd); if (pmd_swp_uffd_wp(*pmd)) newpmd = pmd_swp_mkuffd_wp(newpmd); - set_pmd_at(mm, addr, pmd, newpmd); + } else { + newpmd = *pmd; } + + if (uffd_wp) + newpmd = pmd_swp_mkuffd_wp(newpmd); + else if (uffd_wp_resolve) + newpmd = pmd_swp_clear_uffd_wp(newpmd); + if (!pmd_same(*pmd, newpmd)) + set_pmd_at(mm, addr, pmd, newpmd); goto unlock; } #endif @@ -2657,9 +2665,10 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); is_hzp = is_huge_zero_page(&folio->page); - VM_WARN_ON_ONCE_FOLIO(is_hzp, folio); - if (is_hzp) + if (is_hzp) { + pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); return -EBUSY; + } if (folio_test_writeback(folio)) return -EBUSY; @@ -3251,6 +3260,8 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); + if (pmd_uffd_wp(pmdval)) + pmdswp = pmd_swp_mkuffd_wp(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, vma, true); put_page(page); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 92e6f56a932d..0ec69b96b497 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -572,6 +572,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PTE_NON_PRESENT; goto out; } + if (pte_uffd_wp(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out; + } page = vm_normal_page(vma, address, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 3807502766a3..ec0da72e65aa 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -148,35 +148,74 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end) * into the virtual memory. If those physical pages already had shadow/origin, * those are ignored. */ -void kmsan_ioremap_page_range(unsigned long start, unsigned long end, - phys_addr_t phys_addr, pgprot_t prot, - unsigned int page_shift) +int kmsan_ioremap_page_range(unsigned long start, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) { gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO; struct page *shadow, *origin; unsigned long off = 0; - int nr; + int nr, err = 0, clean = 0, mapped; if (!kmsan_enabled || kmsan_in_runtime()) - return; + return 0; nr = (end - start) / PAGE_SIZE; kmsan_enter_runtime(); - for (int i = 0; i < nr; i++, off += PAGE_SIZE) { + for (int i = 0; i < nr; i++, off += PAGE_SIZE, clean = i) { shadow = alloc_pages(gfp_mask, 1); origin = alloc_pages(gfp_mask, 1); - __vmap_pages_range_noflush( + if (!shadow || !origin) { + err = -ENOMEM; + goto ret; + } + mapped = __vmap_pages_range_noflush( vmalloc_shadow(start + off), vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow, PAGE_SHIFT); - __vmap_pages_range_noflush( + if (mapped) { + err = mapped; + goto ret; + } + shadow = NULL; + mapped = __vmap_pages_range_noflush( vmalloc_origin(start + off), vmalloc_origin(start + off + PAGE_SIZE), prot, &origin, PAGE_SHIFT); + if (mapped) { + __vunmap_range_noflush( + vmalloc_shadow(start + off), + vmalloc_shadow(start + off + PAGE_SIZE)); + err = mapped; + goto ret; + } + origin = NULL; + } + /* Page mapping loop finished normally, nothing to clean up. */ + clean = 0; + +ret: + if (clean > 0) { + /* + * Something went wrong. Clean up shadow/origin pages allocated + * on the last loop iteration, then delete mappings created + * during the previous iterations. + */ + if (shadow) + __free_pages(shadow, 1); + if (origin) + __free_pages(origin, 1); + __vunmap_range_noflush( + vmalloc_shadow(start), + vmalloc_shadow(start + clean * PAGE_SIZE)); + __vunmap_range_noflush( + vmalloc_origin(start), + vmalloc_origin(start + clean * PAGE_SIZE)); } flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); kmsan_leave_runtime(); + return err; } void kmsan_iounmap_page_range(unsigned long start, unsigned long end) diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index a787c04e9583..b8bb95eea5e3 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -216,27 +216,29 @@ void kmsan_free_page(struct page *page, unsigned int order) kmsan_leave_runtime(); } -void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages, - unsigned int page_shift) +int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) { unsigned long shadow_start, origin_start, shadow_end, origin_end; struct page **s_pages, **o_pages; - int nr, mapped; + int nr, mapped, err = 0; if (!kmsan_enabled) - return; + return 0; shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW); shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW); if (!shadow_start) - return; + return 0; nr = (end - start) / PAGE_SIZE; s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); - if (!s_pages || !o_pages) + if (!s_pages || !o_pages) { + err = -ENOMEM; goto ret; + } for (int i = 0; i < nr; i++) { s_pages[i] = shadow_page_for(pages[i]); o_pages[i] = origin_page_for(pages[i]); @@ -249,10 +251,16 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, kmsan_enter_runtime(); mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, s_pages, page_shift); - KMSAN_WARN_ON(mapped); + if (mapped) { + err = mapped; + goto ret; + } mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot, o_pages, page_shift); - KMSAN_WARN_ON(mapped); + if (mapped) { + err = mapped; + goto ret; + } kmsan_leave_runtime(); flush_tlb_kernel_range(shadow_start, shadow_end); flush_tlb_kernel_range(origin_start, origin_end); @@ -262,6 +270,7 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, ret: kfree(s_pages); kfree(o_pages); + return err; } /* Allocate metadata for pages allocated at boot time. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a256a241fd1d..2068b594dc88 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -790,61 +790,50 @@ static int vma_replace_policy(struct vm_area_struct *vma, return err; } -/* Step 2: apply policy to a range and do splits. */ -static int mbind_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct mempolicy *new_pol) +/* Split or merge the VMA (if required) and apply the new policy */ +static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, struct mempolicy *new_pol) { - VMA_ITERATOR(vmi, mm, start); - struct vm_area_struct *prev; - struct vm_area_struct *vma; - int err = 0; + struct vm_area_struct *merged; + unsigned long vmstart, vmend; pgoff_t pgoff; + int err; - prev = vma_prev(&vmi); - vma = vma_find(&vmi, end); - if (WARN_ON(!vma)) + vmend = min(end, vma->vm_end); + if (start > vma->vm_start) { + *prev = vma; + vmstart = start; + } else { + vmstart = vma->vm_start; + } + + if (mpol_equal(vma_policy(vma), new_pol)) return 0; - if (start > vma->vm_start) - prev = vma; - - do { - unsigned long vmstart = max(start, vma->vm_start); - unsigned long vmend = min(end, vma->vm_end); - - if (mpol_equal(vma_policy(vma), new_pol)) - goto next; - - pgoff = vma->vm_pgoff + - ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); - if (prev) { - vma = prev; - goto replace; - } - if (vma->vm_start != vmstart) { - err = split_vma(&vmi, vma, vmstart, 1); - if (err) - goto out; - } - if (vma->vm_end != vmend) { - err = split_vma(&vmi, vma, vmend, 0); - if (err) - goto out; - } -replace: - err = vma_replace_policy(vma, new_pol); + pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); + merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, new_pol, + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (merged) { + *prev = merged; + return vma_replace_policy(merged, new_pol); + } + + if (vma->vm_start != vmstart) { + err = split_vma(vmi, vma, vmstart, 1); if (err) - goto out; -next: - prev = vma; - } for_each_vma_range(vmi, vma, end); + return err; + } -out: - return err; + if (vma->vm_end != vmend) { + err = split_vma(vmi, vma, vmend, 0); + if (err) + return err; + } + + *prev = vma; + return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ @@ -1259,6 +1248,8 @@ static long do_mbind(unsigned long start, unsigned long len, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + struct vma_iterator vmi; struct mempolicy *new; unsigned long end; int err; @@ -1328,7 +1319,13 @@ static long do_mbind(unsigned long start, unsigned long len, goto up_out; } - err = mbind_range(mm, start, end, new); + vma_iter_init(&vmi, mm, start); + prev = vma_prev(&vmi); + for_each_vma_range(vmi, vma, end) { + err = mbind_range(&vmi, vma, &prev, start, end, new); + if (err) + break; + } if (!err) { int nr_failed = 0; @@ -1489,10 +1486,8 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; - unsigned long vmstart; - unsigned long vmend; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); @@ -1521,6 +1516,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (end == start) return 0; mmap_write_lock(mm); + prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any vma in the range got policy other than MPOL_BIND @@ -1541,9 +1537,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le } new->home_node = home_node; - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); - err = mbind_range(mm, vmstart, vmend, new); + err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; diff --git a/mm/mmap.c b/mm/mmap.c index ff68a67a2a7c..d5475fbf5729 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1518,7 +1518,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) */ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { - unsigned long length, gap; + unsigned long length, gap, low_limit; + struct vm_area_struct *tmp; MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); @@ -1527,12 +1528,29 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) if (length < info->length) return -ENOMEM; - if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1, - length)) + low_limit = info->low_limit; +retry: + if (mas_empty_area(&mas, low_limit, info->high_limit - 1, length)) return -ENOMEM; gap = mas.index; gap += (info->align_offset - gap) & info->align_mask; + tmp = mas_next(&mas, ULONG_MAX); + if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) < gap + length - 1) { + low_limit = tmp->vm_end; + mas_reset(&mas); + goto retry; + } + } else { + tmp = mas_prev(&mas, 0); + if (tmp && vm_end_gap(tmp) > gap) { + low_limit = vm_end_gap(tmp); + mas_reset(&mas); + goto retry; + } + } + return gap; } @@ -1548,7 +1566,8 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) */ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { - unsigned long length, gap; + unsigned long length, gap, high_limit, gap_end; + struct vm_area_struct *tmp; MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ @@ -1556,12 +1575,31 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) if (length < info->length) return -ENOMEM; - if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, + high_limit = info->high_limit; +retry: + if (mas_empty_area_rev(&mas, info->low_limit, high_limit - 1, length)) return -ENOMEM; gap = mas.last + 1 - info->length; gap -= (gap - info->align_offset) & info->align_mask; + gap_end = mas.last; + tmp = mas_next(&mas, ULONG_MAX); + if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) <= gap_end) { + high_limit = vm_start_gap(tmp); + mas_reset(&mas); + goto retry; + } + } else { + tmp = mas_prev(&mas, 0); + if (tmp && vm_end_gap(tmp) > gap) { + high_limit = tmp->vm_start; + mas_reset(&mas); + goto retry; + } + } + return gap; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 13e84d8c0797..36351a00c0e8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -838,7 +838,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } tlb_finish_mmu(&tlb); - if (vma_iter_end(&vmi) < end) + if (!error && vma_iter_end(&vmi) < end) error = -ENOMEM; out: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7136c36c5d01..8e39705c7bdc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6632,7 +6632,21 @@ static void __build_all_zonelists(void *data) int nid; int __maybe_unused cpu; pg_data_t *self = data; + unsigned long flags; + /* + * Explicitly disable this CPU's interrupts before taking seqlock + * to prevent any IRQ handler from calling into the page allocator + * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. + */ + local_irq_save(flags); + /* + * Explicitly disable this CPU's synchronous printk() before taking + * seqlock to prevent any printk() from trying to hold port->lock, for + * tty_insert_flip_string_and_push_buffer() on other CPU might be + * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. + */ + printk_deferred_enter(); write_seqlock(&zonelist_update_seq); #ifdef CONFIG_NUMA @@ -6671,6 +6685,8 @@ static void __build_all_zonelists(void *data) } write_sequnlock(&zonelist_update_seq); + printk_deferred_exit(); + local_irq_restore(flags); } static noinline void __init @@ -9450,6 +9466,9 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, if (PageReserved(page)) return false; + + if (PageHuge(page)) + return false; } return true; } diff --git a/mm/swap.c b/mm/swap.c index 57cb01b042f6..423199ee8478 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -222,7 +222,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_init(fbatch); + folio_batch_reinit(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a50072066221..31ff782d368b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -313,8 +313,8 @@ int ioremap_page_range(unsigned long addr, unsigned long end, ioremap_max_page_shift); flush_cache_vmap(addr, end); if (!err) - kmsan_ioremap_page_range(addr, end, phys_addr, prot, - ioremap_max_page_shift); + err = kmsan_ioremap_page_range(addr, end, phys_addr, prot, + ioremap_max_page_shift); return err; } @@ -605,7 +605,11 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { - kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, + page_shift); + + if (ret) + return ret; return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); } diff --git a/tools/Makefile b/tools/Makefile index e497875fc7e3..37e9f6804832 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -39,7 +39,7 @@ help: @echo ' turbostat - Intel CPU idle stats and freq reporting tool' @echo ' usb - USB testing tools' @echo ' virtio - vhost test module' - @echo ' vm - misc vm tools' + @echo ' mm - misc mm tools' @echo ' wmi - WMI interface examples' @echo ' x86_energy_perf_policy - Intel energy policy tool' @echo '' @@ -69,7 +69,7 @@ acpi: FORCE cpupower: FORCE $(call descend,power/$@) -cgroup counter firewire hv guest bootconfig spi usb virtio vm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE +cgroup counter firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE $(call descend,$@) bpf/%: FORCE @@ -118,7 +118,7 @@ kvm_stat: FORCE all: acpi cgroup counter cpupower gpio hv firewire \ perf selftests bootconfig spi turbostat usb \ - virtio vm bpf x86_energy_perf_policy \ + virtio mm bpf x86_energy_perf_policy \ tmon freefall iio objtool kvm_stat wmi \ pci debugging tracing thermal thermometer thermal-engine @@ -128,7 +128,7 @@ acpi_install: cpupower_install: $(call descend,power/$(@:_install=),install) -cgroup_install counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install vm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install: +cgroup_install counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install: $(call descend,$(@:_install=),install) selftests_install: @@ -158,7 +158,7 @@ kvm_stat_install: install: acpi_install cgroup_install counter_install cpupower_install gpio_install \ hv_install firewire_install iio_install \ perf_install selftests_install turbostat_install usb_install \ - virtio_install vm_install bpf_install x86_energy_perf_policy_install \ + virtio_install mm_install bpf_install x86_energy_perf_policy_install \ tmon_install freefall_install objtool_install kvm_stat_install \ wmi_install pci_install debugging_install intel-speed-select_install \ tracing_install thermometer_install thermal-engine_install @@ -169,7 +169,7 @@ acpi_clean: cpupower_clean: $(call descend,power/cpupower,clean) -cgroup_clean counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean vm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean: +cgroup_clean counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean: $(call descend,$(@:_clean=),clean) libapi_clean: @@ -211,7 +211,7 @@ build_clean: clean: acpi_clean cgroup_clean counter_clean cpupower_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean bootconfig_clean spi_clean usb_clean virtio_clean \ - vm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ + mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c index 7c2ac124cdc8..99798894b879 100644 --- a/tools/mm/page_owner_sort.c +++ b/tools/mm/page_owner_sort.c @@ -857,7 +857,7 @@ int main(int argc, char **argv) if (cull & CULL_PID || filter & FILTER_PID) fprintf(fout, ", PID %d", list[i].pid); if (cull & CULL_TGID || filter & FILTER_TGID) - fprintf(fout, ", TGID %d", list[i].pid); + fprintf(fout, ", TGID %d", list[i].tgid); if (cull & CULL_COMM || filter & FILTER_COMM) fprintf(fout, ", task_comm_name: %s", list[i].comm); if (cull & CULL_ALLOCATOR) { |