diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-27 11:40:49 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-27 11:40:49 -0700 |
commit | 8291eaafed36f575f23951f3ce18407f480e9ecf (patch) | |
tree | 279b61422ba2df7b8579af8ccc81331de80affa8 /mm | |
parent | 77fb622de1393b1d54f24f4f7ed98f84feeda502 (diff) | |
parent | fa020a2b87d24016723fff4a4237deb612478a32 (diff) | |
download | lwn-8291eaafed36f575f23951f3ce18407f480e9ecf.tar.gz lwn-8291eaafed36f575f23951f3ce18407f480e9ecf.zip |
Merge tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton:
- Two follow-on fixes for the post-5.19 series "Use pageblock_order for
  cma and alloc_contig_range alignment", from Zi Yan.
- A series of z3fold cleanups and fixes from Miaohe Lin.
- Some memcg selftests work from Michal Koutný <mkoutny@suse.com>.
- Some swap fixes and cleanups from Miaohe Lin.
- Several individual minor fixups.
* tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (25 commits)
mm/shmem.c: suppress shift warning
mm: Kconfig: reorganize misplaced mm options
mm: kasan: fix input of vmalloc_to_page()
mm: fix is_pinnable_page against a cma page
mm: filter out swapin error entry in shmem mapping
mm/shmem: fix infinite loop when swap in shmem error at swapoff time
mm/madvise: free hwpoison and swapin error entry in madvise_free_pte_range
mm/swapfile: fix lost swap bits in unuse_pte()
mm/swapfile: unuse_pte can map random data if swap read fails
selftests: memcg: factor out common parts of memory.{low,min} tests
selftests: memcg: remove protection from top level memcg
selftests: memcg: adjust expected reclaim values of protected cgroups
selftests: memcg: expect no low events in unprotected sibling
selftests: memcg: fix compilation
mm/z3fold: fix z3fold_page_migrate races with z3fold_map
mm/z3fold: fix z3fold_reclaim_page races with z3fold_free
mm/z3fold: always clear PAGE_CLAIMED under z3fold page lock
mm/z3fold: put z3fold page back into unbuddied list when reclaim or migration fails
revert "mm/z3fold.c: allow __GFP_HIGHMEM in z3fold_alloc"
mm/z3fold: throw warning on failure of trylock_page in z3fold_alloc
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 56 | ||||
-rw-r--r-- | mm/Kconfig.debug | 33 | ||||
-rw-r--r-- | mm/internal.h | 4 | ||||
-rw-r--r-- | mm/kasan/report.c | 2 | ||||
-rw-r--r-- | mm/madvise.c | 18 | ||||
-rw-r--r-- | mm/memory.c | 5 | ||||
-rw-r--r-- | mm/page_alloc.c | 32 | ||||
-rw-r--r-- | mm/page_isolation.c | 36 | ||||
-rw-r--r-- | mm/shmem.c | 41 | ||||
-rw-r--r-- | mm/swap_state.c | 3 | ||||
-rw-r--r-- | mm/swapfile.c | 21 | ||||
-rw-r--r-- | mm/z3fold.c | 97 |
12 files changed, 261 insertions, 87 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 905c205e14f3..169e64192e48 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -270,6 +270,19 @@ config SLAB_FREELIST_HARDENED sanity-checking than others. This option is most effective with CONFIG_SLUB. +config SLUB_STATS + default n + bool "Enable SLUB performance statistics" + depends on SLUB && SYSFS + help + SLUB statistics are useful to debug SLUBs allocation behavior in + order find ways to optimize the allocator. This should never be + enabled for production use since keeping statistics slows down + the allocator by a few percentage points. The slabinfo command + supports the determination of the most active slabs to figure + out which slabs are relevant to a particular load. + Try running: slabinfo -DA + config SLUB_CPU_PARTIAL default y depends on SLUB && SMP @@ -307,6 +320,40 @@ config SHUFFLE_PAGE_ALLOCATOR Say Y if unsure. +config COMPAT_BRK + bool "Disable heap randomization" + default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). + This option changes the bootup default to heap randomization + disabled, and can be overridden at runtime by setting + /proc/sys/kernel/randomize_va_space to 2. + + On non-ancient distros (post-2000 ones) N is usually a safe choice. + +config MMAP_ALLOW_UNINITIALIZED + bool "Allow mmapped anonymous memory to be uninitialized" + depends on EXPERT && !MMU + default n + help + Normally, and according to the Linux spec, anonymous memory obtained + from mmap() has its contents cleared before it is passed to + userspace. Enabling this config option allows you to request that + mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus + providing a huge performance boost. If this option is not enabled, + then the flag will be ignored. + + This is taken advantage of by uClibc's malloc(), and also by + ELF-FDPIC binfmt's brk and stack allocator. 
+ + Because of the obvious security issues, this option should only be + enabled on embedded devices where you control what is run in + userspace. Since that isn't generally a problem on no-MMU systems, + it is normally safe to say Y here. + + See Documentation/admin-guide/mm/nommu-mmap.rst for more information. + config SELECT_MEMORY_MODEL def_bool y depends on ARCH_SELECT_MEMORY_MODEL @@ -964,6 +1011,15 @@ config ARCH_USES_HIGH_VMA_FLAGS config ARCH_HAS_PKEYS bool +config VM_EVENT_COUNTERS + default y + bool "Enable VM event counters for /proc/vmstat" if EXPERT + help + VM event counters are needed for event counts to be shown. + This option allows the disabling of the VM event counters + on EXPERT systems. /proc/vmstat will only show page counts + if VM event counters are disabled. + config PERCPU_STATS bool "Collect percpu memory statistics" help diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 5bd5bb097252..ce8dded36de9 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -45,6 +45,39 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT Enable debug page memory allocations by default? This value can be overridden by debug_pagealloc=off|on. +config DEBUG_SLAB + bool "Debug slab memory allocations" + depends on DEBUG_KERNEL && SLAB + help + Say Y here to have the kernel do limited verification on memory + allocation as well as poisoning memory on free to catch use of freed + memory. This can make kmalloc/kfree-intensive workloads much slower. + +config SLUB_DEBUG + default y + bool "Enable SLUB debugging support" if EXPERT + depends on SLUB && SYSFS + select STACKDEPOT if STACKTRACE_SUPPORT + help + SLUB has extensive debug support features. Disabling these can + result in significant savings in code size. This also disables + SLUB sysfs support. /sys/slab will not exist and there will be + no support for cache validation etc. 
+ +config SLUB_DEBUG_ON + bool "SLUB debugging on by default" + depends on SLUB && SLUB_DEBUG + select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT + default n + help + Boot with debugging on by default. SLUB boots by default with + the runtime debug capabilities switched off. Enabling this is + equivalent to specifying the "slub_debug" parameter on boot. + There is no support for more fine grained debug control like + possible with slub_debug=xxx. SLUB debugging may be switched + off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying + "slub_debug=-". + config PAGE_OWNER bool "Track page owner" depends on DEBUG_KERNEL && STACKTRACE_SUPPORT diff --git a/mm/internal.h b/mm/internal.h index 64e61b032dac..c0f8fbe0445b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -374,8 +374,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); -void split_free_page(struct page *free_page, - int order, unsigned long split_pfn_offset); +int split_free_page(struct page *free_page, + unsigned int order, unsigned long split_pfn_offset); #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 199d77cce21a..b341a191651d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -347,7 +347,7 @@ static void print_address_description(void *addr, u8 tag) va->addr, va->addr + va->size, va->caller); pr_err("\n"); - page = vmalloc_to_page(page); + page = vmalloc_to_page(addr); } } diff --git a/mm/madvise.c b/mm/madvise.c index 4d6592488b51..d7b4f2602949 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -248,10 +248,13 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, if (!xa_is_value(page)) continue; + swap = radix_to_swp_entry(page); + /* There might be swapin error entries in shmem mapping. 
*/ + if (non_swap_entry(swap)) + continue; xas_pause(&xas); rcu_read_unlock(); - swap = radix_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, NULL, 0, false, &splug); if (page) @@ -624,11 +627,14 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, swp_entry_t entry; entry = pte_to_swp_entry(ptent); - if (non_swap_entry(entry)) - continue; - nr_swap--; - free_swap_and_cache(entry); - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + if (!non_swap_entry(entry)) { + nr_swap--; + free_swap_and_cache(entry); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + } else if (is_hwpoison_entry(entry) || + is_swapin_error_entry(entry)) { + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + } continue; } diff --git a/mm/memory.c b/mm/memory.c index 54bcd5327b74..21dadf03f089 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1487,7 +1487,8 @@ again: /* Only drop the uffd-wp marker if explicitly requested */ if (!zap_drop_file_uffd_wp(details)) continue; - } else if (is_hwpoison_entry(entry)) { + } else if (is_hwpoison_entry(entry) || + is_swapin_error_entry(entry)) { if (!should_zap_cows(details)) continue; } else { @@ -3727,6 +3728,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; + } else if (is_swapin_error_entry(entry)) { + ret = VM_FAULT_SIGBUS; } else if (is_pte_marker_entry(entry)) { ret = handle_pte_marker(vmf); } else { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 149f2ab5063b..e008a3df0485 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -482,8 +482,12 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page, bitidx = pfn_to_bitidx(page, pfn); word_bitidx = bitidx / BITS_PER_LONG; bitidx &= (BITS_PER_LONG-1); - - word = bitmap[word_bitidx]; + /* + * This races, without locks, with set_pfnblock_flags_mask(). 
Ensure + * a consistent read of the memory array, so that results, even though + * racy, are not corrupted. + */ + word = READ_ONCE(bitmap[word_bitidx]); return (word >> bitidx) & mask; } @@ -1100,30 +1104,44 @@ done_merging: * @order: the order of the page * @split_pfn_offset: split offset within the page * + * Return -ENOENT if the free page is changed, otherwise 0 + * * It is used when the free page crosses two pageblocks with different migratetypes * at split_pfn_offset within the page. The split free page will be put into * separate migratetype lists afterwards. Otherwise, the function achieves * nothing. */ -void split_free_page(struct page *free_page, - int order, unsigned long split_pfn_offset) +int split_free_page(struct page *free_page, + unsigned int order, unsigned long split_pfn_offset) { struct zone *zone = page_zone(free_page); unsigned long free_page_pfn = page_to_pfn(free_page); unsigned long pfn; unsigned long flags; int free_page_order; + int mt; + int ret = 0; if (split_pfn_offset == 0) - return; + return ret; spin_lock_irqsave(&zone->lock, flags); + + if (!PageBuddy(free_page) || buddy_order(free_page) != order) { + ret = -ENOENT; + goto out; + } + + mt = get_pageblock_migratetype(free_page); + if (likely(!is_migrate_isolate(mt))) + __mod_zone_freepage_state(zone, -(1UL << order), mt); + del_page_from_free_list(free_page, zone, order); for (pfn = free_page_pfn; pfn < free_page_pfn + (1UL << order);) { int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); - free_page_order = min_t(int, + free_page_order = min_t(unsigned int, pfn ? __ffs(pfn) : order, __fls(split_pfn_offset)); __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, @@ -1134,7 +1152,9 @@ void split_free_page(struct page *free_page, if (split_pfn_offset == 0) split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); } +out: spin_unlock_irqrestore(&zone->lock, flags); + return ret; } /* * A bad page could be due to a number of fields. 
Instead of multiple branches, diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c643c8420809..6021f8444b5a 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -300,7 +300,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * the in-use page then splitting the free page. */ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, - gfp_t gfp_flags, bool isolate_before) + gfp_t gfp_flags, bool isolate_before, bool skip_isolation) { unsigned char saved_mt; unsigned long start_pfn; @@ -327,11 +327,16 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, zone->zone_start_pfn); saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); - ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags, - isolate_pageblock, isolate_pageblock + pageblock_nr_pages); - if (ret) - return ret; + if (skip_isolation) + VM_BUG_ON(!is_migrate_isolate(saved_mt)); + else { + ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags, + isolate_pageblock, isolate_pageblock + pageblock_nr_pages); + + if (ret) + return ret; + } /* * Bail out early when the to-be-isolated pageblock does not form @@ -366,9 +371,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, if (PageBuddy(page)) { int order = buddy_order(page); - if (pfn + (1UL << order) > boundary_pfn) - split_free_page(page, order, boundary_pfn - pfn); - pfn += (1UL << order); + if (pfn + (1UL << order) > boundary_pfn) { + /* free page changed before split, check it again */ + if (split_free_page(page, order, boundary_pfn - pfn)) + continue; + } + + pfn += 1UL << order; continue; } /* @@ -463,7 +472,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, return 0; failed: /* restore the original migratetype */ - unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt); + if (!skip_isolation) + unset_migratetype_isolate(pfn_to_page(isolate_pageblock), 
saved_mt); return -EBUSY; } @@ -522,14 +532,18 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); int ret; + bool skip_isolation = false; /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */ - ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false); + ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation); if (ret) return ret; + if (isolate_start == isolate_end - pageblock_nr_pages) + skip_isolation = true; + /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */ - ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true); + ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation); if (ret) { unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); return ret; diff --git a/mm/shmem.c b/mm/shmem.c index da30c769b376..a6f565308133 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1174,6 +1174,10 @@ static int shmem_find_swap_entries(struct address_space *mapping, continue; entry = radix_to_swp_entry(folio); + /* + * swapin error entries can be found in the mapping. But they're + * deliberately ignored here as we've done everything we can do. 
+ */ if (swp_type(entry) != type) continue; @@ -1671,6 +1675,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, return error; } +static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, + struct folio *folio, swp_entry_t swap) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + swp_entry_t swapin_error; + void *old; + + swapin_error = make_swapin_error_entry(&folio->page); + old = xa_cmpxchg_irq(&mapping->i_pages, index, + swp_to_radix_entry(swap), + swp_to_radix_entry(swapin_error), 0); + if (old != swp_to_radix_entry(swap)) + return; + + folio_wait_writeback(folio); + delete_from_swap_cache(&folio->page); + spin_lock_irq(&info->lock); + /* + * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't + * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in + * shmem_evict_inode. + */ + info->alloced--; + info->swapped--; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + swap_free(swap); +} + /* * Swap in the page pointed to by *pagep. * Caller has to make sure that *pagep contains a valid swapped page. @@ -1694,6 +1728,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swap = radix_to_swp_entry(*foliop); *foliop = NULL; + if (is_swapin_error_entry(swap)) + return -EIO; + /* Look it up and read it in.. 
*/ page = lookup_swap_cache(swap, NULL, 0); if (!page) { @@ -1761,6 +1798,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, failed: if (!shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; + if (error == -EIO) + shmem_set_folio_swapin_error(inode, index, folio, swap); unlock: if (folio) { folio_unlock(folio); @@ -1906,7 +1945,7 @@ alloc_nohuge: spin_lock_irq(&info->lock); info->alloced += folio_nr_pages(folio); - inode->i_blocks += BLOCKS_PER_PAGE << folio_order(folio); + inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); alloced = true; diff --git a/mm/swap_state.c b/mm/swap_state.c index b9e4ed2e90bf..778d57d2d92d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -410,6 +410,9 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) return NULL; swp = radix_to_swp_entry(page); + /* There might be swapin error entries in shmem mapping. */ + if (non_swap_entry(swp)) + return NULL; /* Prevent swapoff from happening to us */ si = get_swap_device(swp); if (!si) diff --git a/mm/swapfile.c b/mm/swapfile.c index 94b4ff43ead0..a2e66d855b19 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1775,7 +1775,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, { struct page *swapcache; spinlock_t *ptl; - pte_t *pte; + pte_t *pte, new_pte; int ret = 1; swapcache = page; @@ -1789,6 +1789,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } + if (unlikely(!PageUptodate(page))) { + pte_t pteval; + + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); + pteval = swp_entry_to_pte(make_swapin_error_entry(page)); + set_pte_at(vma->vm_mm, addr, pte, pteval); + swap_free(entry); + ret = 0; + goto out; + } + /* See do_swap_page() */ BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); BUG_ON(PageAnon(page) && PageAnonExclusive(page)); @@ -1813,8 +1824,12 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 
page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); + new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); + if (pte_swp_soft_dirty(*pte)) + new_pte = pte_mksoft_dirty(new_pte); + if (pte_swp_uffd_wp(*pte)) + new_pte = pte_mkuffd_wp(new_pte); + set_pte_at(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: pte_unmap_unlock(pte, ptl); diff --git a/mm/z3fold.c b/mm/z3fold.c index 83b5a3514427..f41f8b0d9e9a 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -181,6 +181,7 @@ enum z3fold_page_flags { NEEDS_COMPACTING, PAGE_STALE, PAGE_CLAIMED, /* by either reclaim or free */ + PAGE_MIGRATED, /* page is migrated and soon to be released */ }; /* @@ -212,10 +213,8 @@ static int size_to_chunks(size_t size) static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, gfp_t gfp) { - struct z3fold_buddy_slots *slots; - - slots = kmem_cache_zalloc(pool->c_handle, - (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE))); + struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle, + gfp); if (slots) { /* It will be freed separately in free_handle(). 
*/ @@ -272,8 +271,13 @@ static inline struct z3fold_header *get_z3fold_header(unsigned long handle) zhdr = (struct z3fold_header *)(addr & PAGE_MASK); locked = z3fold_page_trylock(zhdr); read_unlock(&slots->lock); - if (locked) - break; + if (locked) { + struct page *page = virt_to_page(zhdr); + + if (!test_bit(PAGE_MIGRATED, &page->private)) + break; + z3fold_page_unlock(zhdr); + } cpu_relax(); } while (true); } else { @@ -391,6 +395,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, clear_bit(NEEDS_COMPACTING, &page->private); clear_bit(PAGE_STALE, &page->private); clear_bit(PAGE_CLAIMED, &page->private); + clear_bit(PAGE_MIGRATED, &page->private); if (headless) return zhdr; @@ -521,13 +526,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) atomic64_dec(&pool->pages_nr); } -static void release_z3fold_page(struct kref *ref) -{ - struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, - refcount); - __release_z3fold_page(zhdr, false); -} - static void release_z3fold_page_locked(struct kref *ref) { struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, @@ -940,10 +938,19 @@ lookup: } } - if (zhdr && !zhdr->slots) - zhdr->slots = alloc_slots(pool, - can_sleep ? 
GFP_NOIO : GFP_ATOMIC); + if (zhdr && !zhdr->slots) { + zhdr->slots = alloc_slots(pool, GFP_ATOMIC); + if (!zhdr->slots) + goto out_fail; + } return zhdr; + +out_fail: + if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + add_to_unbuddied(pool, zhdr); + z3fold_page_unlock(zhdr); + } + return NULL; } /* @@ -1066,7 +1073,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, enum buddy bud; bool can_sleep = gfpflags_allow_blocking(gfp); - if (!size) + if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; if (size > PAGE_SIZE) @@ -1093,28 +1100,7 @@ retry: bud = FIRST; } - page = NULL; - if (can_sleep) { - spin_lock(&pool->stale_lock); - zhdr = list_first_entry_or_null(&pool->stale, - struct z3fold_header, buddy); - /* - * Before allocating a page, let's see if we can take one from - * the stale pages list. cancel_work_sync() can sleep so we - * limit this case to the contexts where we can sleep - */ - if (zhdr) { - list_del(&zhdr->buddy); - spin_unlock(&pool->stale_lock); - cancel_work_sync(&zhdr->work); - page = virt_to_page(zhdr); - } else { - spin_unlock(&pool->stale_lock); - } - } - if (!page) - page = alloc_page(gfp); - + page = alloc_page(gfp); if (!page) return -ENOMEM; @@ -1134,10 +1120,9 @@ retry: __SetPageMovable(page, pool->inode->i_mapping); unlock_page(page); } else { - if (trylock_page(page)) { - __SetPageMovable(page, pool->inode->i_mapping); - unlock_page(page); - } + WARN_ON(!trylock_page(page)); + __SetPageMovable(page, pool->inode->i_mapping); + unlock_page(page); } z3fold_page_lock(zhdr); @@ -1236,8 +1221,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) return; } if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { - put_z3fold_header(zhdr); clear_bit(PAGE_CLAIMED, &page->private); + put_z3fold_header(zhdr); return; } if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { @@ -1332,12 +1317,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) break; } - if 
(kref_get_unless_zero(&zhdr->refcount) == 0) { - zhdr = NULL; - break; - } if (!z3fold_page_trylock(zhdr)) { - kref_put(&zhdr->refcount, release_z3fold_page); zhdr = NULL; continue; /* can't evict at this point */ } @@ -1348,14 +1328,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) */ if (zhdr->foreign_handles || test_and_set_bit(PAGE_CLAIMED, &page->private)) { - if (!kref_put(&zhdr->refcount, - release_z3fold_page_locked)) - z3fold_page_unlock(zhdr); + z3fold_page_unlock(zhdr); zhdr = NULL; continue; /* can't evict such page */ } list_del_init(&zhdr->buddy); zhdr->cpu = -1; + /* See comment in __z3fold_alloc. */ + kref_get(&zhdr->refcount); break; } @@ -1437,8 +1417,10 @@ next: spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); - z3fold_page_unlock(zhdr); + if (list_empty(&zhdr->buddy)) + add_to_unbuddied(pool, zhdr); clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); } /* We started off locked to we need to lock the pool back */ @@ -1590,8 +1572,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa if (!z3fold_page_trylock(zhdr)) return -EAGAIN; if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) { - z3fold_page_unlock(zhdr); clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); return -EBUSY; } if (work_pending(&zhdr->work)) { @@ -1601,7 +1583,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa new_zhdr = page_address(newpage); memcpy(new_zhdr, zhdr, PAGE_SIZE); newpage->private = page->private; - page->private = 0; + set_bit(PAGE_MIGRATED, &page->private); z3fold_page_unlock(zhdr); spin_lock_init(&new_zhdr->page_lock); INIT_WORK(&new_zhdr->work, compact_page_work); @@ -1631,7 +1613,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); - clear_bit(PAGE_CLAIMED, &page->private); + /* 
PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */ + page->private = 0; put_page(page); return 0; } @@ -1653,6 +1636,8 @@ static void z3fold_page_putback(struct page *page) spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); + if (list_empty(&zhdr->buddy)) + add_to_unbuddied(pool, zhdr); clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } |