path: root/mm/swapfile.c
author     Linus Torvalds <torvalds@linux-foundation.org>  2022-05-26 12:32:41 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2022-05-26 12:32:41 -0700
commit     98931dd95fd489fcbfa97da563505a6f071d7c77 (patch)
tree       44683fc4a92efa614acdca2742a7ff19d26da1e3 /mm/swapfile.c
parent     df202b452fe6c6d6f1351bad485e2367ef1e644e (diff)
parent     f403f22f8ccb12860b2b62fec3173c6ccd45938b (diff)
Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
 "Almost all of MM here. A few things are still getting finished off, reviewed, etc.

 - Yang Shi has improved the behaviour of khugepaged collapsing of readonly file-backed transparent hugepages.

 - Johannes Weiner has arranged for zswap memory use to be tracked and managed on a per-cgroup basis.

 - Muchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for runtime enablement of the recent huge page vmemmap optimization feature.

 - Baolin Wang contributes a series to fix some issues around hugetlb pagetable invalidation.

 - Zhenwei Pi has fixed some interactions between hwpoisoned pages and virtualization.

 - Tong Tiangen has enabled the use of the presently x86-only page_table_check debugging feature on arm64 and riscv.

 - David Vernet has done some fixup work on the memcg selftests.

 - Peter Xu has taught userfaultfd to handle write protection faults against shmem- and hugetlbfs-backed files.

 - More DAMON development from SeongJae Park - adding online tuning of the feature and support for monitoring of fixed virtual address ranges. Also easier discovery of which monitoring operations are available.

 - Nadav Amit has done some optimization of TLB flushing during mprotect().

 - Neil Brown continues to labor away at improving our swap-over-NFS support.

 - David Hildenbrand has some fixes to anon page COWing versus get_user_pages().

 - Peng Liu fixed some errors in the core hugetlb code.

 - Joao Martins has reduced the amount of memory consumed by device-dax's compound devmaps.

 - Some cleanups of the arch-specific pagemap code from Anshuman Khandual.

 - Muchun Song has found and fixed some errors in the TLB flushing of transparent hugepages.

 - Roman Gushchin has done more work on the memcg selftests.

 ... and, of course, many smaller fixes and cleanups. Notably, the customary million cleanup serieses from Miaohe Lin"

* tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits)
  mm: kfence: use PAGE_ALIGNED helper
  selftests: vm: add the "settings" file with timeout variable
  selftests: vm: add "test_hmm.sh" to TEST_FILES
  selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests
  selftests: vm: add migration to the .gitignore
  selftests/vm/pkeys: fix typo in comment
  ksm: fix typo in comment
  selftests: vm: add process_mrelease tests
  Revert "mm/vmscan: never demote for memcg reclaim"
  mm/kfence: print disabling or re-enabling message
  include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace"
  include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion"
  mm: fix a potential infinite loop in start_isolate_page_range()
  MAINTAINERS: add Muchun as co-maintainer for HugeTLB
  zram: fix Kconfig dependency warning
  mm/shmem: fix shmem folio swapoff hang
  cgroup: fix an error handling path in alloc_pagecache_max_30M()
  mm: damon: use HPAGE_PMD_SIZE
  tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate
  nodemask.h: fix compilation error with GCC12
  ...
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--  mm/swapfile.c  135
1 file changed, 70 insertions, 65 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6aec1b24f440..94b4ff43ead0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -45,6 +45,7 @@
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
+#include "swap.h"
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
@@ -77,9 +78,9 @@ static PLIST_HEAD(swap_active_head);
/*
* all available (active, not full) swap_info_structs
* protected with swap_avail_lock, ordered by priority.
- * This is used by get_swap_page() instead of swap_active_head
+ * This is used by folio_alloc_swap() instead of swap_active_head
* because swap_active_head includes all swap_info_structs,
- * but get_swap_page() doesn't need to look at full ones.
+ * but folio_alloc_swap() doesn't need to look at full ones.
* This uses its own lock instead of swap_lock because when a
* swap_info_struct changes between not-full/full, it needs to
* add/remove itself to/from this list, but the swap_info_struct->lock
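To make the locking rules in this comment concrete, here is a minimal sketch of how an allocation path such as folio_alloc_swap()/get_swap_pages() is expected to walk the per-node avail plist. The helper name is made up for illustration, and the real function does more (per-size goals, highest_bit checks, dropping full devices from the list), so treat this purely as a sketch, not in-tree code:

/* Sketch only: hypothetical helper, loosely modelled on get_swap_pages(). */
static int alloc_one_swap_slot_sketch(int node, swp_entry_t *slot)
{
	struct swap_info_struct *si, *next;

	spin_lock(&swap_avail_lock);
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
				  avail_lists[node]) {
		/* requeue so same-priority devices are used round-robin */
		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
		spin_unlock(&swap_avail_lock);

		spin_lock(&si->lock);
		if (scan_swap_map_slots(si, SWAP_HAS_CACHE, 1, slot)) {
			spin_unlock(&si->lock);
			return 1;	/* highest-priority device with free space wins */
		}
		spin_unlock(&si->lock);

		/* nothing free here; retake the list lock, try the next device */
		spin_lock(&swap_avail_lock);
	}
	spin_unlock(&swap_avail_lock);
	return 0;			/* every available device is out of space */
}

Because full devices remove themselves from this list (under swap_avail_lock, as the comment explains), the walk above never has to filter them out.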
@@ -775,6 +776,22 @@ static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
this_cpu_write(*si->cluster_next_cpu, next);
}
+static bool swap_offset_available_and_locked(struct swap_info_struct *si,
+ unsigned long offset)
+{
+ if (data_race(!si->swap_map[offset])) {
+ spin_lock(&si->lock);
+ return true;
+ }
+
+ if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
+ spin_lock(&si->lock);
+ return true;
+ }
+
+ return false;
+}
+
static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned char usage, int nr,
swp_entry_t slots[])
@@ -952,15 +969,8 @@ done:
scan:
spin_unlock(&si->lock);
while (++offset <= READ_ONCE(si->highest_bit)) {
- if (data_race(!si->swap_map[offset])) {
- spin_lock(&si->lock);
- goto checks;
- }
- if (vm_swap_full() &&
- READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
- spin_lock(&si->lock);
+ if (swap_offset_available_and_locked(si, offset))
goto checks;
- }
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
@@ -969,15 +979,8 @@ scan:
}
offset = si->lowest_bit;
while (offset < scan_base) {
- if (data_race(!si->swap_map[offset])) {
- spin_lock(&si->lock);
+ if (swap_offset_available_and_locked(si, offset))
goto checks;
- }
- if (vm_swap_full() &&
- READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
- spin_lock(&si->lock);
- goto checks;
- }
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
@@ -1122,7 +1125,7 @@ noswap:
return n_ret;
}
-static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
unsigned long offset;
@@ -1137,8 +1140,13 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
offset = swp_offset(entry);
if (offset >= p->max)
goto bad_offset;
+ if (data_race(!p->swap_map[swp_offset(entry)]))
+ goto bad_free;
return p;
+bad_free:
+ pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
+ goto out;
bad_offset:
pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
goto out;
@@ -1151,23 +1159,6 @@ out:
return NULL;
}
-static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
-{
- struct swap_info_struct *p;
-
- p = __swap_info_get(entry);
- if (!p)
- goto out;
- if (data_race(!p->swap_map[swp_offset(entry)]))
- goto bad_free;
- return p;
-
-bad_free:
- pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
-out:
- return NULL;
-}
-
static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
struct swap_info_struct *q)
{
@@ -1283,6 +1274,7 @@ bad_nofile:
out:
return NULL;
put_out:
+ pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
percpu_ref_put(&si->users);
return NULL;
}
@@ -1440,7 +1432,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
* This does not give an exact answer when swap count is continued,
* but does include the high COUNT_CONTINUED flag to allow for that.
*/
-int page_swapcount(struct page *page)
+static int page_swapcount(struct page *page)
{
int count = 0;
struct swap_info_struct *p;
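A note on the "not exact ... COUNT_CONTINUED" caveat in the comment above: each swap_map byte packs a small use count plus two flags, and the count page_swapcount() reads only strips the cache flag. The values below come from include/linux/swap.h and the helper mirrors swap_count() earlier in this file; the snippet is for orientation only and is not part of the patch:

/*
 * swap_map byte layout (values from include/linux/swap.h):
 *   SWAP_HAS_CACHE  0x40  - a swap-cache page also references this slot
 *   COUNT_CONTINUED 0x80  - count overflowed into continuation pages
 *   SWAP_MAP_MAX    0x3e  - largest count the byte itself can hold
 *   SWAP_MAP_BAD    0x3f  - bad slot, never allocated
 */
static inline unsigned char swap_count(unsigned char ent)
{
	/* strip the cache flag; COUNT_CONTINUED may still be set */
	return ent & ~SWAP_HAS_CACHE;
}

The exact count for a continued entry is only recovered by swp_swapcount(), which walks the continuation pages; page_swapcount() deliberately stays cheap and just reports the flagged value.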
@@ -1797,13 +1789,28 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
goto out;
}
+ /* See do_swap_page() */
+ BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
+ BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
if (page == swapcache) {
- page_add_anon_rmap(page, vma, addr, false);
+ rmap_t rmap_flags = RMAP_NONE;
+
+ /*
+ * See do_swap_page(): PageWriteback() would be problematic.
+ * However, we do a wait_on_page_writeback() just before this
+ * call and have the page locked.
+ */
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if (pte_swp_exclusive(*pte))
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ page_add_anon_rmap(page, vma, addr, rmap_flags);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, addr, false);
+ page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
}
set_pte_at(vma->vm_mm, addr, pte,
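For context on the new rmap_flags handling: the bit tested by pte_swp_exclusive() above is planted on the swap PTE when the page is unmapped. Roughly, and only as an illustration paraphrasing try_to_unmap_one() in mm/rmap.c (not part of this patch):

/* Sketch of the swap-out side that pairs with pte_swp_exclusive() above. */
swp_pte = swp_entry_to_pte(entry);
if (anon_exclusive)				/* PageAnonExclusive() was set */
	swp_pte = pte_swp_mkexclusive(swp_pte);	/* remember it in the swap PTE */
if (pte_soft_dirty(pteval))
	swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);

On swap-in, here and in do_swap_page(), that bit is translated back into RMAP_EXCLUSIVE so the page is re-mapped as exclusive to this anon VMA.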
@@ -1984,9 +1991,9 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
}
/*
- * Scan swap_map (or frontswap_map if frontswap parameter is true)
- * from current position to next entry still in use. Return 0
- * if there are no inuse entries after prev till end of the map.
+ * Scan swap_map from current position to next entry still in use.
+ * Return 0 if there are no inuse entries after prev till end of
+ * the map.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
unsigned int prev)
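With the frontswap parameter gone, the scan the comment describes reduces to a single loop over swap_map. The following is a sketch of that shape, consistent with the updated comment rather than a verbatim copy of the function:

/* Sketch of find_next_to_unuse() after the frontswap parameter was dropped. */
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
				       unsigned int prev)
{
	unsigned int i;
	unsigned char count;

	for (i = prev + 1; i < si->max; i++) {
		count = READ_ONCE(si->swap_map[i]);
		if (count && swap_count(count) != SWAP_MAP_BAD)
			break;			/* found an entry still in use */
		if ((i % LATENCY_LIMIT) == 0)
			cond_resched();
	}

	if (i == si->max)
		i = 0;				/* nothing in use after prev */

	return i;
}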
@@ -2094,11 +2101,12 @@ retry:
* Under global memory pressure, swap entries can be reinserted back
* into process space after the mmlist loop above passes over them.
*
- * Limit the number of retries? No: when mmget_not_zero() above fails,
- * that mm is likely to be freeing swap from exit_mmap(), which proceeds
- * at its own independent pace; and even shmem_writepage() could have
- * been preempted after get_swap_page(), temporarily hiding that swap.
- * It's easy and robust (though cpu-intensive) just to keep retrying.
+ * Limit the number of retries? No: when mmget_not_zero()
+ * above fails, that mm is likely to be freeing swap from
+ * exit_mmap(), which proceeds at its own independent pace;
+ * and even shmem_writepage() could have been preempted after
+ * folio_alloc_swap(), temporarily hiding that swap. It's easy
+ * and robust (though cpu-intensive) just to keep retrying.
*/
if (READ_ONCE(si->inuse_pages)) {
if (!signal_pending(current))
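The policy spelled out in this comment amounts to a very small piece of code. A sketch of how the retry tail continues past the hunk's trailing context, for orientation only and not part of the diff:

	if (READ_ONCE(si->inuse_pages)) {
		if (!signal_pending(current))
			goto retry;		/* keep retrying, no limit */
		return -EINTR;			/* unless the caller was signalled */
	}
	return 0;				/* this device's swap is fully unused */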
@@ -2201,8 +2209,8 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
/*
* A `swap extent' is a simple thing which maps a contiguous range of pages
- * onto a contiguous range of disk blocks. An ordered list of swap extents
- * is built at swapon time and is then used at swap_writepage/swap_readpage
+ * onto a contiguous range of disk blocks. A rbtree of swap extents is
+ * built at swapon time and is then used at swap_writepage/swap_readpage
* time for locating where on disk a page belongs.
*
* If the swapfile is an S_ISBLK block device, a single extent is installed.
@@ -2210,12 +2218,12 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
* swap files identically.
*
* Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
- * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
+ * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
* swapfiles are handled *identically* after swapon time.
*
* For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
- * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
- * some stray blocks are found which do not fall within the PAGE_SIZE alignment
+ * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray
+ * blocks are found which do not fall within the PAGE_SIZE alignment
* requirements, they are simply tossed out - we will never use those blocks
* for swapping.
*
@@ -2224,10 +2232,7 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
*
* The amount of disk space which a single swap extent represents varies.
* Typically it is in the 1-4 megabyte range. So we can have hundreds of
- * extents in the list. To avoid much list walking, we cache the previous
- * search location in `curr_swap_extent', and start new searches from there.
- * This is extremely effective. The average number of iterations in
- * map_swap_page() has been measured at about 0.3 per page. - akpm.
+ * extents in the rbtree. - akpm.
*/
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
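To make the rbtree wording in the comment concrete: a swap extent and the lookup it supports look roughly like the snippet below. The struct matches include/linux/swap.h; the lookup paraphrases offset_to_swap_extent() under a made-up name. Illustration only, not part of this patch:

/* One extent: nr_pages swap pages from start_page map to a contiguous
 * run of disk blocks beginning at start_block. */
struct swap_extent {
	struct rb_node rb_node;		/* keyed by start_page */
	pgoff_t start_page;
	pgoff_t nr_pages;
	sector_t start_block;
};

/* Sketch of the O(log n) lookup done at swap_writepage/swap_readpage time. */
static struct swap_extent *lookup_swap_extent_sketch(struct swap_info_struct *sis,
						     unsigned long offset)
{
	struct rb_node *rb = sis->swap_extent_root.rb_node;

	while (rb) {
		struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);

		if (offset < se->start_page)
			rb = rb->rb_left;
		else if (offset >= se->start_page + se->nr_pages)
			rb = rb->rb_right;
		else
			return se;	/* offset falls inside this extent */
	}
	return NULL;			/* the in-tree code treats this as a BUG */
}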
@@ -2244,12 +2249,13 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
if (mapping->a_ops->swap_activate) {
ret = mapping->a_ops->swap_activate(sis, swap_file, span);
- if (ret >= 0)
- sis->flags |= SWP_ACTIVATED;
- if (!ret) {
- sis->flags |= SWP_FS_OPS;
- ret = add_swap_extent(sis, 0, sis->max, 0);
- *span = sis->pages;
+ if (ret < 0)
+ return ret;
+ sis->flags |= SWP_ACTIVATED;
+ if ((sis->flags & SWP_FS_OPS) &&
+ sio_pool_init() != 0) {
+ destroy_swap_extents(sis);
+ return -ENOMEM;
}
return ret;
}
@@ -2311,7 +2317,7 @@ static void _enable_swap_info(struct swap_info_struct *p)
* which on removal of any swap_info_struct with an auto-assigned
* (i.e. negative) priority increments the auto-assigned priority
* of any lower-priority swap_info_structs.
- * swap_avail_head needs to be priority ordered for get_swap_page(),
+ * swap_avail_head needs to be priority ordered for folio_alloc_swap(),
* which allocates swap pages from the highest available priority
* swap_info_struct.
*/
@@ -3314,8 +3320,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
- if (p)
- put_swap_device(p);
+ put_swap_device(p);
return err;
}