author    | Andrew Morton <akpm@linux-foundation.org> | 2022-09-26 13:13:15 -0700
committer | Andrew Morton <akpm@linux-foundation.org> | 2022-09-26 13:13:15 -0700
commit    | 6d751329e761338faa9c24c2c9736f27bc54282b (patch)
tree      | 6e1778a7448552c58d12da88f70f6ad2718d55da
parent    | 088b8aa537c2c767765f1c19b555f21ffe555786 (diff)
parent    | 59298997df89e19aad426d4ae0a7e5037074da5a (diff)
Merge branch 'mm-hotfixes-stable' into mm-stable
 arch/powerpc/mm/book3s64/radix_pgtable.c | 9
 arch/x86/lib/usercopy.c | 2
 arch/x86/mm/Makefile | 3
 fs/ntfs/super.c | 3
 fs/xfs/xfs_notify_failure.c | 6
 include/linux/memremap.h | 5
 mm/damon/dbgfs.c | 19
 mm/frontswap.c | 3
 mm/gup.c | 34
 mm/huge_memory.c | 6
 mm/hugetlb.c | 14
 mm/khugepaged.c | 10
 mm/madvise.c | 7
 mm/memory-failure.c | 27
 mm/memory.c | 14
 mm/migrate_device.c | 16
 mm/page_alloc.c | 65
 mm/page_isolation.c | 25
 mm/secretmem.c | 2
 mm/swap_state.c | 2
 mm/vmscan.c | 4
 tools/include/linux/gfp.h | 21
 tools/include/linux/gfp_types.h | 1
23 files changed, 192 insertions(+), 106 deletions(-)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 698274109c91..e712f80fe189 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -937,15 +937,6 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre pmd = *pmdp; pmd_clear(pmdp); - /* - * pmdp collapse_flush need to ensure that there are no parallel gup - * walk after this call. This is needed so that we can have stable - * page ref count when collapsing a page. We don't allow a collapse page - * if we have gup taken on the page. We can ensure that by sending IPI - * because gup walk happens with IRQ disabled. - */ - serialize_against_pte_lookup(vma->vm_mm); - radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); return pmd; diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index ad0139d25401..f1bb18617156 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -44,7 +44,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) * called from other contexts. */ pagefault_disable(); - ret = __copy_from_user_inatomic(to, from, n); + ret = raw_copy_from_user(to, from, n); pagefault_enable(); return ret; diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index f8220fd2c169..829c1409ffbd 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -4,10 +4,12 @@ KCOV_INSTRUMENT_tlb.o := n KCOV_INSTRUMENT_mem_encrypt.o := n KCOV_INSTRUMENT_mem_encrypt_amd.o := n KCOV_INSTRUMENT_mem_encrypt_identity.o := n +KCOV_INSTRUMENT_pgprot.o := n KASAN_SANITIZE_mem_encrypt.o := n KASAN_SANITIZE_mem_encrypt_amd.o := n KASAN_SANITIZE_mem_encrypt_identity.o := n +KASAN_SANITIZE_pgprot.o := n # Disable KCSAN entirely, because otherwise we get warnings that some functions # reference __initdata sections. @@ -17,6 +19,7 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg CFLAGS_REMOVE_mem_encrypt_amd.o = -pg CFLAGS_REMOVE_mem_encrypt_identity.o = -pg +CFLAGS_REMOVE_pgprot.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \ diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 5ae8de09b271..001f4e053c85 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2092,7 +2092,8 @@ get_ctx_vol_failed: // TODO: Initialize security. /* Get the extended system files' directory inode. 
*/ vol->extend_ino = ntfs_iget(sb, FILE_Extend); - if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) { + if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) || + !S_ISDIR(vol->extend_ino->i_mode)) { if (!IS_ERR(vol->extend_ino)) iput(vol->extend_ino); ntfs_error(sb, "Failed to load $Extend."); diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 69d9c83ea4b2..5b1f9a24ed59 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -175,13 +175,13 @@ xfs_dax_notify_failure( u64 ddev_start; u64 ddev_end; - if (!(mp->m_sb.sb_flags & SB_BORN)) { + if (!(mp->m_super->s_flags & SB_BORN)) { xfs_warn(mp, "filesystem is not ready for notify_failure()!"); return -EIO; } if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { - xfs_warn(mp, + xfs_debug(mp, "notify_failure() not supported on realtime device!"); return -EOPNOTSUPP; } @@ -194,7 +194,7 @@ xfs_dax_notify_failure( } if (!xfs_has_rmapbt(mp)) { - xfs_warn(mp, "notify_failure() needs rmapbt enabled!"); + xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); return -EOPNOTSUPP; } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 19010491a603..c3b4cc84877b 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -139,6 +139,11 @@ struct dev_pagemap { }; }; +static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap) +{ + return pgmap->ops && pgmap->ops->memory_failure; +} + static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) { if (pgmap->flags & PGMAP_ALTMAP_VALID) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 3b55a1b219b5..652a94deafe3 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -884,6 +884,7 @@ static int dbgfs_rm_context(char *name) struct dentry *root, *dir, **new_dirs; struct damon_ctx **new_ctxs; int i, j; + int ret = 0; if (damon_nr_running_ctxs()) return -EBUSY; @@ -898,14 +899,16 @@ static int dbgfs_rm_context(char *name) new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), GFP_KERNEL); - if (!new_dirs) - return -ENOMEM; + if (!new_dirs) { + ret = -ENOMEM; + goto out_dput; + } new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs), GFP_KERNEL); if (!new_ctxs) { - kfree(new_dirs); - return -ENOMEM; + ret = -ENOMEM; + goto out_new_dirs; } for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) { @@ -925,7 +928,13 @@ static int dbgfs_rm_context(char *name) dbgfs_ctxs = new_ctxs; dbgfs_nr_ctxs--; - return 0; + goto out_dput; + +out_new_dirs: + kfree(new_dirs); +out_dput: + dput(dir); + return ret; } static ssize_t dbgfs_rm_context_write(struct file *file, diff --git a/mm/frontswap.c b/mm/frontswap.c index 1a97610308cb..279e55b4ed87 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -125,6 +125,9 @@ void frontswap_init(unsigned type, unsigned long *map) * p->frontswap set to something valid to work properly. */ frontswap_map_set(sis, map); + + if (!frontswap_enabled()) + return; frontswap_ops->init(type); } @@ -2394,8 +2394,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, } #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL -static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - unsigned int flags, struct page **pages, int *nr) +/* + * Fast-gup relies on pte change detection to avoid concurrent pgtable + * operations. + * + * To pin the page, fast-gup needs to do below in order: + * (1) pin the page (by prefetching pte), then (2) check pte not changed. 
+ * + * For the rest of pgtable operations where pgtable updates can be racy + * with fast-gup, we need to do (1) clear pte, then (2) check whether page + * is pinned. + * + * Above will work for all pte-level operations, including THP split. + * + * For THP collapse, it's a bit more complicated because fast-gup may be + * walking a pgtable page that is being freed (pte is still valid but pmd + * can be cleared already). To avoid race in such condition, we need to + * also check pmd here to make sure pmd doesn't change (corresponds to + * pmdp_collapse_flush() in the THP collapse code path). + */ +static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, + struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; @@ -2441,7 +2461,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, goto pte_unmap; } - if (unlikely(pte_val(pte) != pte_val(*ptep))) { + if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || + unlikely(pte_val(pte) != pte_val(*ptep))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2488,8 +2509,9 @@ pte_unmap: * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. */ -static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - unsigned int flags, struct page **pages, int *nr) +static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, + struct page **pages, int *nr) { return 0; } @@ -2813,7 +2835,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, PMD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr)) + } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f18896c8f9a..7bf2299cb24b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2900,11 +2900,9 @@ static void split_huge_pages_all(void) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { int nr_pages; - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - if (!get_page_unless_zero(page)) + page = pfn_to_online_page(pfn); + if (!page || !get_page_unless_zero(page)) continue; if (zone != page_zone(page)) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 46387208fd9d..2ca4e8c3163e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3421,6 +3421,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) { int i, nid = page_to_nid(page); struct hstate *target_hstate; + struct page *subpage; int rc = 0; target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); @@ -3454,15 +3455,16 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) mutex_lock(&target_hstate->resize_lock); for (i = 0; i < pages_per_huge_page(h); i += pages_per_huge_page(target_hstate)) { + subpage = nth_page(page, i); if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_page_for_demote(page + i, + prep_compound_gigantic_page_for_demote(subpage, target_hstate->order); else - prep_compound_page(page + i, target_hstate->order); - set_page_private(page + i, 0); - set_page_refcounted(page + i); - prep_new_huge_page(target_hstate, page + i, nid); - put_page(page + i); + prep_compound_page(subpage, target_hstate->order); + 
set_page_private(subpage, 0); + set_page_refcounted(subpage); + prep_new_huge_page(target_hstate, subpage, nid); + put_page(subpage); } mutex_unlock(&target_hstate->resize_lock); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 5f7c60b8b269..0bcba493ebb4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1071,10 +1071,12 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* - * After this gup_fast can't run anymore. This also removes - * any huge TLB entry from the CPU so we won't allow - * huge and small TLB entries for the same virtual address - * to avoid the risk of CPU bugs in that area. + * This removes any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address to + * avoid the risk of CPU bugs in that area. + * + * Parallel fast GUP is fine since fast GUP will back off when + * it detects PMD is changed. */ _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); diff --git a/mm/madvise.c b/mm/madvise.c index af97100a0727..4f86eb7f554d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -452,8 +452,11 @@ regular_page: continue; } - /* Do not interfere with other mappings of this page */ - if (page_mapcount(page) != 1) + /* + * Do not interfere with other mappings of this page and + * non-LRU page. + */ + if (!PageLRU(page) || page_mapcount(page) != 1) continue; VM_BUG_ON_PAGE(PageTransCompound(page), page); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6dd74f535f77..265378237c22 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -345,13 +345,17 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, * not much we can do. We just print a message and ignore otherwise. */ +#define FSDAX_INVALID_PGOFF ULONG_MAX + /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. * - * Notice: @fsdax_pgoff is used only when @p is a fsdax page. - * In other cases, such as anonymous and file-backend page, the address to be - * killed can be caculated by @p itself. + * Note: @fsdax_pgoff is used only when @p is a fsdax page and a + * filesystem with a memory failure handler has claimed the + * memory_failure event. In all other cases, page->index and + * page->mapping are sufficient for mapping the page back to its + * corresponding user virtual address. */ static void add_to_kill(struct task_struct *tsk, struct page *p, pgoff_t fsdax_pgoff, struct vm_area_struct *vma, @@ -367,11 +371,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, tk->addr = page_address_in_vma(p, vma); if (is_zone_device_page(p)) { - /* - * Since page->mapping is not used for fsdax, we need - * calculate the address based on the vma. - */ - if (p->pgmap->type == MEMORY_DEVICE_FS_DAX) + if (fsdax_pgoff != FSDAX_INVALID_PGOFF) tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma); tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); } else @@ -524,7 +524,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (!page_mapped_in_vma(page, vma)) continue; if (vma->vm_mm == t->mm) - add_to_kill(t, page, 0, vma, to_kill); + add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma, + to_kill); } } read_unlock(&tasklist_lock); @@ -560,7 +561,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * to be informed of all such data corruptions. 
*/ if (vma->vm_mm == t->mm) - add_to_kill(t, page, 0, vma, to_kill); + add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma, + to_kill); } } read_unlock(&tasklist_lock); @@ -744,6 +746,9 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, }; priv.tk.tsk = p; + if (!p->mm) + return -EFAULT; + mmap_read_lock(p->mm); ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops, (void *)&priv); @@ -1925,7 +1930,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, * Call driver's implementation to handle the memory failure, otherwise * fall back to generic handler. */ - if (pgmap->ops->memory_failure) { + if (pgmap_has_memory_failure(pgmap)) { rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags); /* * Fall back to generic handler too if operation is not diff --git a/mm/memory.c b/mm/memory.c index b994784158f5..e38f9245470c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4387,14 +4387,20 @@ vm_fault_t finish_fault(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - ret = 0; + /* Re-check under ptl */ - if (likely(!vmf_pte_changed(vmf))) + if (likely(!vmf_pte_changed(vmf))) { do_set_pte(vmf, page, vmf->address); - else + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, vmf->address, vmf->pte); + + ret = 0; + } else { + update_mmu_tlb(vma, vmf->address, vmf->pte); ret = VM_FAULT_NOPAGE; + } - update_mmu_tlb(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 72354248dde9..d8efd5a0eb40 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/memremap.h> #include <linux/migrate.h> +#include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/oom.h> @@ -200,10 +201,10 @@ again: bool anon_exclusive; pte_t swp_pte; + flush_cache_page(vma, addr, pte_pfn(*ptep)); anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive) { - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush(vma, addr, ptep); + pte = ptep_clear_flush(vma, addr, ptep); if (page_try_share_anon_rmap(page)) { set_pte_at(mm, addr, ptep, pte); @@ -213,11 +214,15 @@ again: goto next; } } else { - ptep_get_and_clear(mm, addr, ptep); + pte = ptep_get_and_clear(mm, addr, ptep); } migrate->cpages++; + /* Set the dirty flag on the folio now the pte is gone. */ + if (pte_dirty(pte)) + folio_mark_dirty(page_folio(page)); + /* Setup special migration page table entry */ if (mpfn & MIGRATE_PFN_WRITE) entry = make_writable_migration_entry( @@ -261,13 +266,14 @@ next: migrate->dst[migrate->npages] = 0; migrate->src[migrate->npages++] = mpfn; } - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(ptep - 1, ptl); /* Only flush the TLB if we actually modified any entries */ if (unmapped) flush_tlb_range(walk->vma, start, end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep - 1, ptl); + return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d49803d46af..262896bd1a90 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4701,6 +4701,30 @@ void fs_reclaim_release(gfp_t gfp_mask) EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif +/* + * Zonelists may change due to hotplug during allocation. Detect when zonelists + * have been rebuilt so allocation retries. Reader side does not lock and + * retries the allocation if zonelist changes. Writer side is protected by the + * embedded spin_lock. 
+ */ +static DEFINE_SEQLOCK(zonelist_update_seq); + +static unsigned int zonelist_iter_begin(void) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqbegin(&zonelist_update_seq); + + return 0; +} + +static unsigned int check_retry_zonelist(unsigned int seq) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqretry(&zonelist_update_seq, seq); + + return seq; +} + /* Perform direct synchronous page reclaim */ static unsigned long __perform_reclaim(gfp_t gfp_mask, unsigned int order, @@ -4994,6 +5018,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int compaction_retries; int no_progress_loops; unsigned int cpuset_mems_cookie; + unsigned int zonelist_iter_cookie; int reserve_flags; /* @@ -5004,11 +5029,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; -retry_cpuset: +restart: compaction_retries = 0; no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist_iter_cookie = zonelist_iter_begin(); /* * The fast path uses conservative alloc_flags to succeed only until @@ -5180,9 +5206,13 @@ retry: goto retry; - /* Deal with possible cpuset update races before we start OOM killing */ - if (check_retry_cpuset(cpuset_mems_cookie, ac)) - goto retry_cpuset; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * a unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); @@ -5202,9 +5232,13 @@ retry: } nopage: - /* Deal with possible cpuset update races before we fail */ - if (check_retry_cpuset(cpuset_mems_cookie, ac)) - goto retry_cpuset; + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * a unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; /* * Make sure that __GFP_NOFAIL request doesn't leak out and make sure @@ -5699,6 +5733,18 @@ refill: /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; offset = size - fragsz; + if (unlikely(offset < 0)) { + /* + * The caller is trying to allocate a fragment + * with fragsz > PAGE_SIZE but the cache isn't big + * enough to satisfy the request, this may + * happen in low memory conditions. + * We don't release the cache page because + * it could make memory pressure worse + * so we simply return NULL here. 
+ */ + return NULL; + } } nc->pagecnt_bias--; @@ -6507,9 +6553,8 @@ static void __build_all_zonelists(void *data) int nid; int __maybe_unused cpu; pg_data_t *self = data; - static DEFINE_SPINLOCK(lock); - spin_lock(&lock); + write_seqlock(&zonelist_update_seq); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -6546,7 +6591,7 @@ static void __build_all_zonelists(void *data) #endif } - spin_unlock(&lock); + write_sequnlock(&zonelist_update_seq); } static noinline void __init diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 9d73dc38e3d7..eb3a68ca92ad 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -288,6 +288,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * @isolate_before: isolate the pageblock before the boundary_pfn * @skip_isolation: the flag to skip the pageblock isolation in second * isolate_single_pageblock() + * @migratetype: migrate type to set in error recovery. * * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one * pageblock. When not all pageblocks within a page are isolated at the same @@ -302,9 +303,9 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * the in-use page then splitting the free page. */ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, - gfp_t gfp_flags, bool isolate_before, bool skip_isolation) + gfp_t gfp_flags, bool isolate_before, bool skip_isolation, + int migratetype) { - unsigned char saved_mt; unsigned long start_pfn; unsigned long isolate_pageblock; unsigned long pfn; @@ -328,13 +329,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES), zone->zone_start_pfn); - saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); + if (skip_isolation) { + int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); - if (skip_isolation) - VM_BUG_ON(!is_migrate_isolate(saved_mt)); - else { - ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags, - isolate_pageblock, isolate_pageblock + pageblock_nr_pages); + VM_BUG_ON(!is_migrate_isolate(mt)); + } else { + ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype, + flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages); if (ret) return ret; @@ -475,7 +476,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, failed: /* restore the original migratetype */ if (!skip_isolation) - unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt); + unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype); return -EBUSY; } @@ -537,7 +538,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, bool skip_isolation = false; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation); + ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, + skip_isolation, migratetype); if (ret) return ret; @@ -545,7 +547,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, skip_isolation = true; /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation); + ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, + skip_isolation, migratetype); if (ret) { unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); return ret; 
diff --git a/mm/secretmem.c b/mm/secretmem.c index e3e9590c6fb3..3f7154099795 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -285,7 +285,7 @@ static int secretmem_init(void) secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) - ret = PTR_ERR(secretmem_mnt); + return PTR_ERR(secretmem_mnt); /* prevent secretmem mappings from ever getting PROT_EXEC */ secretmem_mnt->mnt_flags |= MNT_NOEXEC; diff --git a/mm/swap_state.c b/mm/swap_state.c index e166051566f4..41afa6d45b23 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -151,7 +151,7 @@ void __delete_from_swap_cache(struct folio *folio, for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, shadow); - VM_BUG_ON_FOLIO(entry != folio, folio); + VM_BUG_ON_PAGE(entry != folio, entry); set_page_private(folio_page(folio, i), 0); xas_next(&xas); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 02e720c83901..c879694b41fe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2550,8 +2550,8 @@ static void shrink_active_list(unsigned long nr_to_scan, } if (unlikely(buffer_heads_over_limit)) { - if (folio_get_private(folio) && folio_trylock(folio)) { - if (folio_get_private(folio)) + if (folio_test_private(folio) && folio_trylock(folio)) { + if (folio_test_private(folio)) filemap_release_folio(folio, 0); folio_unlock(folio); } diff --git a/tools/include/linux/gfp.h b/tools/include/linux/gfp.h index b238dbc9eb85..6a10ff5f5be9 100644 --- a/tools/include/linux/gfp.h +++ b/tools/include/linux/gfp.h @@ -3,26 +3,7 @@ #define _TOOLS_INCLUDE_LINUX_GFP_H #include <linux/types.h> - -#define __GFP_BITS_SHIFT 26 -#define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) - -#define __GFP_HIGH 0x20u -#define __GFP_IO 0x40u -#define __GFP_FS 0x80u -#define __GFP_NOWARN 0x200u -#define __GFP_ZERO 0x8000u -#define __GFP_ATOMIC 0x80000u -#define __GFP_ACCOUNT 0x100000u -#define __GFP_DIRECT_RECLAIM 0x400000u -#define __GFP_KSWAPD_RECLAIM 0x2000000u - -#define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM) - -#define GFP_ZONEMASK 0x0fu -#define GFP_ATOMIC (__GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM) -#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) -#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) +#include <linux/gfp_types.h> static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) { diff --git a/tools/include/linux/gfp_types.h b/tools/include/linux/gfp_types.h new file mode 100644 index 000000000000..5f9f1ed190a0 --- /dev/null +++ b/tools/include/linux/gfp_types.h @@ -0,0 +1 @@ +#include "../../../include/linux/gfp_types.h" |
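
The comment block added to mm/gup.c above spells out the ordering fast GUP depends on: take the page reference first, then verify that neither the pte nor (for THP collapse) the pmd changed underneath, and back off if either did. Below is a condensed, illustrative sketch of that check order only — it is not the real gup_pte_range(), and pin_one_page()/unpin_one_page() are hypothetical stand-ins for the actual folio grab/put helpers.

	#include <linux/mm.h>
	#include <linux/pgtable.h>

	/* Hypothetical helpers standing in for the real folio grab/put primitives. */
	static bool pin_one_page(struct page *page);
	static void unpin_one_page(struct page *page);

	static int fast_gup_one_pte(pmd_t pmd, pmd_t *pmdp, pte_t *ptep,
				    struct page **pagep)
	{
		pte_t pte = ptep_get(ptep);		/* (0) snapshot the pte */
		struct page *page = pte_page(pte);

		if (!pin_one_page(page))		/* (1) take the pin first */
			return 0;

		/*
		 * (2) re-check both levels: a racing unmap changes the pte, and a
		 * racing THP collapse can clear the pmd while this pte page is
		 * being freed, so either mismatch means the pin must be dropped.
		 */
		if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
		    unlikely(pte_val(pte) != pte_val(*ptep))) {
			unpin_one_page(page);
			return 0;
		}

		*pagep = page;
		return 1;
	}

This re-check is why the powerpc radix__pmdp_collapse_flush() hunk can drop serialize_against_pte_lookup(): as the khugepaged.c comment in this merge notes, parallel fast GUP now backs off when it sees the pmd change, so the IPI broadcast is no longer needed for correctness.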
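
The mm/page_alloc.c change above replaces the static spinlock in __build_all_zonelists() with zonelist_update_seq, so the allocator can notice a concurrent zonelist rebuild and restart instead of declaring OOM prematurely. As a rough, self-contained sketch of that reader-retry/writer-lock seqlock pattern (illustrative only; rebuild_seq, shared_state, and the two functions below are made-up names, not kernel code):

	#include <linux/seqlock.h>

	/* Hypothetical data guarded the way page_alloc.c now guards zonelists. */
	static DEFINE_SEQLOCK(rebuild_seq);
	static int shared_state;

	/*
	 * Writer: the seqlock's embedded spinlock serializes writers and bumps
	 * the sequence count, which is how readers notice a rebuild happened.
	 */
	static void writer_rebuild(int new_state)
	{
		write_seqlock(&rebuild_seq);
		shared_state = new_state;
		write_sequnlock(&rebuild_seq);
	}

	/*
	 * Reader: never blocks; it samples the sequence, does its lockless
	 * work, and simply restarts if a writer ran in the meantime.
	 */
	static int reader_snapshot(void)
	{
		unsigned int seq;
		int val;

		do {
			seq = read_seqbegin(&rebuild_seq);
			val = shared_state;
		} while (read_seqretry(&rebuild_seq, seq));

		return val;
	}

In the hunk itself, zonelist_iter_begin() and check_retry_zonelist() are the reader side of exactly this loop, wrapped so the sequence check compiles away unless CONFIG_MEMORY_HOTREMOVE is enabled.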