diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-26 12:32:41 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-26 12:32:41 -0700 |
commit | 98931dd95fd489fcbfa97da563505a6f071d7c77 (patch) | |
tree | 44683fc4a92efa614acdca2742a7ff19d26da1e3 /mm/swapfile.c | |
parent | df202b452fe6c6d6f1351bad485e2367ef1e644e (diff) | |
parent | f403f22f8ccb12860b2b62fec3173c6ccd45938b (diff) | |
download | lwn-98931dd95fd489fcbfa97da563505a6f071d7c77.tar.gz lwn-98931dd95fd489fcbfa97da563505a6f071d7c77.zip |
Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
"Almost all of MM here. A few things are still getting finished off,
reviewed, etc.
- Yang Shi has improved the behaviour of khugepaged collapsing of
readonly file-backed transparent hugepages.
- Johannes Weiner has arranged for zswap memory use to be tracked and
managed on a per-cgroup basis.
- Munchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for
runtime enablement of the recent huge page vmemmap optimization
feature.
- Baolin Wang contributes a series to fix some issues around hugetlb
pagetable invalidation.
- Zhenwei Pi has fixed some interactions between hwpoisoned pages and
virtualization.
- Tong Tiangen has enabled the use of the presently x86-only
page_table_check debugging feature on arm64 and riscv.
- David Vernet has done some fixup work on the memcg selftests.
- Peter Xu has taught userfaultfd to handle write protection faults
against shmem- and hugetlbfs-backed files.
- More DAMON development from SeongJae Park - adding online tuning of
the feature and support for monitoring of fixed virtual address
ranges. Also easier discovery of which monitoring operations are
available.
- Nadav Amit has done some optimization of TLB flushing during
mprotect().
- Neil Brown continues to labor away at improving our swap-over-NFS
support.
- David Hildenbrand has some fixes to anon page COWing versus
get_user_pages().
- Peng Liu fixed some errors in the core hugetlb code.
- Joao Martins has reduced the amount of memory consumed by
device-dax's compound devmaps.
- Some cleanups of the arch-specific pagemap code from Anshuman
Khandual.
- Muchun Song has found and fixed some errors in the TLB flushing of
transparent hugepages.
- Roman Gushchin has done more work on the memcg selftests.
... and, of course, many smaller fixes and cleanups. Notably, the
customary million cleanup serieses from Miaohe Lin"
* tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits)
mm: kfence: use PAGE_ALIGNED helper
selftests: vm: add the "settings" file with timeout variable
selftests: vm: add "test_hmm.sh" to TEST_FILES
selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests
selftests: vm: add migration to the .gitignore
selftests/vm/pkeys: fix typo in comment
ksm: fix typo in comment
selftests: vm: add process_mrelease tests
Revert "mm/vmscan: never demote for memcg reclaim"
mm/kfence: print disabling or re-enabling message
include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace"
include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion"
mm: fix a potential infinite loop in start_isolate_page_range()
MAINTAINERS: add Muchun as co-maintainer for HugeTLB
zram: fix Kconfig dependency warning
mm/shmem: fix shmem folio swapoff hang
cgroup: fix an error handling path in alloc_pagecache_max_30M()
mm: damon: use HPAGE_PMD_SIZE
tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate
nodemask.h: fix compilation error with GCC12
...
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r-- | mm/swapfile.c | 135 |
1 files changed, 70 insertions, 65 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6aec1b24f440..94b4ff43ead0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -45,6 +45,7 @@ #include <asm/tlbflush.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> +#include "swap.h" static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); @@ -77,9 +78,9 @@ static PLIST_HEAD(swap_active_head); /* * all available (active, not full) swap_info_structs * protected with swap_avail_lock, ordered by priority. - * This is used by get_swap_page() instead of swap_active_head + * This is used by folio_alloc_swap() instead of swap_active_head * because swap_active_head includes all swap_info_structs, - * but get_swap_page() doesn't need to look at full ones. + * but folio_alloc_swap() doesn't need to look at full ones. * This uses its own lock instead of swap_lock because when a * swap_info_struct changes between not-full/full, it needs to * add/remove itself to/from this list, but the swap_info_struct->lock @@ -775,6 +776,22 @@ static void set_cluster_next(struct swap_info_struct *si, unsigned long next) this_cpu_write(*si->cluster_next_cpu, next); } +static bool swap_offset_available_and_locked(struct swap_info_struct *si, + unsigned long offset) +{ + if (data_race(!si->swap_map[offset])) { + spin_lock(&si->lock); + return true; + } + + if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { + spin_lock(&si->lock); + return true; + } + + return false; +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -952,15 +969,8 @@ done: scan: spin_unlock(&si->lock); while (++offset <= READ_ONCE(si->highest_bit)) { - if (data_race(!si->swap_map[offset])) { - spin_lock(&si->lock); - goto checks; - } - if (vm_swap_full() && - READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { - spin_lock(&si->lock); + if (swap_offset_available_and_locked(si, offset)) goto checks; - } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; @@ -969,15 +979,8 @@ scan: } offset = si->lowest_bit; while (offset < scan_base) { - if (data_race(!si->swap_map[offset])) { - spin_lock(&si->lock); + if (swap_offset_available_and_locked(si, offset)) goto checks; - } - if (vm_swap_full() && - READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { - spin_lock(&si->lock); - goto checks; - } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; @@ -1122,7 +1125,7 @@ noswap: return n_ret; } -static struct swap_info_struct *__swap_info_get(swp_entry_t entry) +static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; unsigned long offset; @@ -1137,8 +1140,13 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry) offset = swp_offset(entry); if (offset >= p->max) goto bad_offset; + if (data_race(!p->swap_map[swp_offset(entry)])) + goto bad_free; return p; +bad_free: + pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); + goto out; bad_offset: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); goto out; @@ -1151,23 +1159,6 @@ out: return NULL; } -static struct swap_info_struct *_swap_info_get(swp_entry_t entry) -{ - struct swap_info_struct *p; - - p = __swap_info_get(entry); - if (!p) - goto out; - if (data_race(!p->swap_map[swp_offset(entry)])) - goto bad_free; - return p; - -bad_free: - pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); -out: - return NULL; -} - static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, struct swap_info_struct *q) { @@ -1283,6 +1274,7 @@ bad_nofile: out: return NULL; put_out: + pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); percpu_ref_put(&si->users); return NULL; } @@ -1440,7 +1432,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n) * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ -int page_swapcount(struct page *page) +static int page_swapcount(struct page *page) { int count = 0; struct swap_info_struct *p; @@ -1797,13 +1789,28 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } + /* See do_swap_page() */ + BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); + BUG_ON(PageAnon(page) && PageAnonExclusive(page)); + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); if (page == swapcache) { - page_add_anon_rmap(page, vma, addr, false); + rmap_t rmap_flags = RMAP_NONE; + + /* + * See do_swap_page(): PageWriteback() would be problematic. + * However, we do a wait_on_page_writeback() just before this + * call and have the page locked. + */ + VM_BUG_ON_PAGE(PageWriteback(page), page); + if (pte_swp_exclusive(*pte)) + rmap_flags |= RMAP_EXCLUSIVE; + + page_add_anon_rmap(page, vma, addr, rmap_flags); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, addr, false); + page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } set_pte_at(vma->vm_mm, addr, pte, @@ -1984,9 +1991,9 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) } /* - * Scan swap_map (or frontswap_map if frontswap parameter is true) - * from current position to next entry still in use. Return 0 - * if there are no inuse entries after prev till end of the map. + * Scan swap_map from current position to next entry still in use. + * Return 0 if there are no inuse entries after prev till end of + * the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev) @@ -2094,11 +2101,12 @@ retry: * Under global memory pressure, swap entries can be reinserted back * into process space after the mmlist loop above passes over them. * - * Limit the number of retries? No: when mmget_not_zero() above fails, - * that mm is likely to be freeing swap from exit_mmap(), which proceeds - * at its own independent pace; and even shmem_writepage() could have - * been preempted after get_swap_page(), temporarily hiding that swap. - * It's easy and robust (though cpu-intensive) just to keep retrying. + * Limit the number of retries? No: when mmget_not_zero() + * above fails, that mm is likely to be freeing swap from + * exit_mmap(), which proceeds at its own independent pace; + * and even shmem_writepage() could have been preempted after + * folio_alloc_swap(), temporarily hiding that swap. It's easy + * and robust (though cpu-intensive) just to keep retrying. */ if (READ_ONCE(si->inuse_pages)) { if (!signal_pending(current)) @@ -2201,8 +2209,8 @@ EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages - * onto a contiguous range of disk blocks. An ordered list of swap extents - * is built at swapon time and is then used at swap_writepage/swap_readpage + * onto a contiguous range of disk blocks. A rbtree of swap extents is + * built at swapon time and is then used at swap_writepage/swap_readpage * time for locating where on disk a page belongs. * * If the swapfile is an S_ISBLK block device, a single extent is installed. @@ -2210,12 +2218,12 @@ EXPORT_SYMBOL_GPL(add_swap_extent); * swap files identically. * * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap - * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK + * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK * swapfiles are handled *identically* after swapon time. * * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks - * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If - * some stray blocks are found which do not fall within the PAGE_SIZE alignment + * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray + * blocks are found which do not fall within the PAGE_SIZE alignment * requirements, they are simply tossed out - we will never use those blocks * for swapping. * @@ -2224,10 +2232,7 @@ EXPORT_SYMBOL_GPL(add_swap_extent); * * The amount of disk space which a single swap extent represents varies. * Typically it is in the 1-4 megabyte range. So we can have hundreds of - * extents in the list. To avoid much list walking, we cache the previous - * search location in `curr_swap_extent', and start new searches from there. - * This is extremely effective. The average number of iterations in - * map_swap_page() has been measured at about 0.3 per page. - akpm. + * extents in the rbtree. - akpm. */ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { @@ -2244,12 +2249,13 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); - if (ret >= 0) - sis->flags |= SWP_ACTIVATED; - if (!ret) { - sis->flags |= SWP_FS_OPS; - ret = add_swap_extent(sis, 0, sis->max, 0); - *span = sis->pages; + if (ret < 0) + return ret; + sis->flags |= SWP_ACTIVATED; + if ((sis->flags & SWP_FS_OPS) && + sio_pool_init() != 0) { + destroy_swap_extents(sis); + return -ENOMEM; } return ret; } @@ -2311,7 +2317,7 @@ static void _enable_swap_info(struct swap_info_struct *p) * which on removal of any swap_info_struct with an auto-assigned * (i.e. negative) priority increments the auto-assigned priority * of any lower-priority swap_info_structs. - * swap_avail_head needs to be priority ordered for get_swap_page(), + * swap_avail_head needs to be priority ordered for folio_alloc_swap(), * which allocates swap pages from the highest available priority * swap_info_struct. */ @@ -3314,8 +3320,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); - if (p) - put_swap_device(p); + put_swap_device(p); return err; } |