author     Linus Torvalds <torvalds@linux-foundation.org>  2022-05-26 12:32:41 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2022-05-26 12:32:41 -0700
commit     98931dd95fd489fcbfa97da563505a6f071d7c77 (patch)
tree       44683fc4a92efa614acdca2742a7ff19d26da1e3 /mm/page_io.c
parent     df202b452fe6c6d6f1351bad485e2367ef1e644e (diff)
parent     f403f22f8ccb12860b2b62fec3173c6ccd45938b (diff)
Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
"Almost all of MM here. A few things are still getting finished off,
reviewed, etc.
- Yang Shi has improved the behaviour of khugepaged collapsing of
readonly file-backed transparent hugepages.
- Johannes Weiner has arranged for zswap memory use to be tracked and
managed on a per-cgroup basis.
- Muchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for
runtime enablement of the recent huge page vmemmap optimization
feature.
- Baolin Wang contributes a series to fix some issues around hugetlb
pagetable invalidation.
- Zhenwei Pi has fixed some interactions between hwpoisoned pages and
virtualization.
- Tong Tiangen has enabled the use of the presently x86-only
page_table_check debugging feature on arm64 and riscv.
- David Vernet has done some fixup work on the memcg selftests.
- Peter Xu has taught userfaultfd to handle write protection faults
against shmem- and hugetlbfs-backed files.
- More DAMON development from SeongJae Park - adding online tuning of
the feature and support for monitoring of fixed virtual address
ranges. Also easier discovery of which monitoring operations are
available.
- Nadav Amit has done some optimization of TLB flushing during
mprotect().
- Neil Brown continues to labor away at improving our swap-over-NFS
support.
- David Hildenbrand has some fixes to anon page COWing versus
get_user_pages().
- Peng Liu fixed some errors in the core hugetlb code.
- Joao Martins has reduced the amount of memory consumed by
device-dax's compound devmaps.
- Some cleanups of the arch-specific pagemap code from Anshuman
Khandual.
- Muchun Song has found and fixed some errors in the TLB flushing of
transparent hugepages.
- Roman Gushchin has done more work on the memcg selftests.
... and, of course, many smaller fixes and cleanups. Notably, the
customary million cleanup series from Miaohe Lin"
* tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits)
mm: kfence: use PAGE_ALIGNED helper
selftests: vm: add the "settings" file with timeout variable
selftests: vm: add "test_hmm.sh" to TEST_FILES
selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests
selftests: vm: add migration to the .gitignore
selftests/vm/pkeys: fix typo in comment
ksm: fix typo in comment
selftests: vm: add process_mrelease tests
Revert "mm/vmscan: never demote for memcg reclaim"
mm/kfence: print disabling or re-enabling message
include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace"
include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion"
mm: fix a potential infinite loop in start_isolate_page_range()
MAINTAINERS: add Muchun as co-maintainer for HugeTLB
zram: fix Kconfig dependency warning
mm/shmem: fix shmem folio swapoff hang
cgroup: fix an error handling path in alloc_pagecache_max_30M()
mm: damon: use HPAGE_PMD_SIZE
tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate
nodemask.h: fix compilation error with GCC12
...
Diffstat (limited to 'mm/page_io.c')
-rw-r--r-- | mm/page_io.c | 253 |
1 file changed, 192 insertions(+), 61 deletions(-)
diff --git a/mm/page_io.c b/mm/page_io.c
index a9444e67ec20..68318134dc92 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -26,6 +26,7 @@
 #include <linux/uio.h>
 #include <linux/sched/task.h>
 #include <linux/delayacct.h>
+#include "swap.h"
 
 void end_swap_bio_write(struct bio *bio)
 {
@@ -234,55 +235,119 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 #define bio_associate_blkg_from_page(bio, page)	do { } while (0)
 #endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
 
-int __swap_writepage(struct page *page, struct writeback_control *wbc,
-		bio_end_io_t end_write_func)
+struct swap_iocb {
+	struct kiocb		iocb;
+	struct bio_vec		bvec[SWAP_CLUSTER_MAX];
+	int			pages;
+	int			len;
+};
+static mempool_t *sio_pool;
+
+int sio_pool_init(void)
 {
-	struct bio *bio;
-	int ret;
-	struct swap_info_struct *sis = page_swap_info(page);
+	if (!sio_pool) {
+		mempool_t *pool = mempool_create_kmalloc_pool(
+			SWAP_CLUSTER_MAX, sizeof(struct swap_iocb));
+		if (cmpxchg(&sio_pool, NULL, pool))
+			mempool_destroy(pool);
+	}
+	if (!sio_pool)
+		return -ENOMEM;
+	return 0;
+}
 
-	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
-	if (data_race(sis->flags & SWP_FS_OPS)) {
-		struct kiocb kiocb;
-		struct file *swap_file = sis->swap_file;
-		struct address_space *mapping = swap_file->f_mapping;
-		struct bio_vec bv = {
-			.bv_page = page,
-			.bv_len = PAGE_SIZE,
-			.bv_offset = 0
-		};
-		struct iov_iter from;
-
-		iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
-		init_sync_kiocb(&kiocb, swap_file);
-		kiocb.ki_pos = page_file_offset(page);
+static void sio_write_complete(struct kiocb *iocb, long ret)
+{
+	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
+	struct page *page = sio->bvec[0].bv_page;
+	int p;
 
-		set_page_writeback(page);
-		unlock_page(page);
-		ret = mapping->a_ops->direct_IO(&kiocb, &from);
-		if (ret == PAGE_SIZE) {
-			count_vm_event(PSWPOUT);
-			ret = 0;
-		} else {
-			/*
-			 * In the case of swap-over-nfs, this can be a
-			 * temporary failure if the system has limited
-			 * memory for allocating transmit buffers.
-			 * Mark the page dirty and avoid
-			 * folio_rotate_reclaimable but rate-limit the
-			 * messages but do not flag PageError like
-			 * the normal direct-to-bio case as it could
-			 * be temporary.
-			 */
+	if (ret != sio->len) {
+		/*
+		 * In the case of swap-over-nfs, this can be a
+		 * temporary failure if the system has limited
+		 * memory for allocating transmit buffers.
+		 * Mark the page dirty and avoid
+		 * folio_rotate_reclaimable but rate-limit the
+		 * messages but do not flag PageError like
+		 * the normal direct-to-bio case as it could
+		 * be temporary.
+		 */
+		pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
+				   ret, page_file_offset(page));
+		for (p = 0; p < sio->pages; p++) {
+			page = sio->bvec[p].bv_page;
 			set_page_dirty(page);
 			ClearPageReclaim(page);
-			pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
-					   page_file_offset(page));
 		}
-		end_page_writeback(page);
-		return ret;
+	} else {
+		for (p = 0; p < sio->pages; p++)
+			count_swpout_vm_event(sio->bvec[p].bv_page);
 	}
 
+	for (p = 0; p < sio->pages; p++)
+		end_page_writeback(sio->bvec[p].bv_page);
+
+	mempool_free(sio, sio_pool);
+}
+
+static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
+{
+	struct swap_iocb *sio = NULL;
+	struct swap_info_struct *sis = page_swap_info(page);
+	struct file *swap_file = sis->swap_file;
+	loff_t pos = page_file_offset(page);
+
+	set_page_writeback(page);
+	unlock_page(page);
+	if (wbc->swap_plug)
+		sio = *wbc->swap_plug;
+	if (sio) {
+		if (sio->iocb.ki_filp != swap_file ||
+		    sio->iocb.ki_pos + sio->len != pos) {
+			swap_write_unplug(sio);
+			sio = NULL;
+		}
+	}
+	if (!sio) {
+		sio = mempool_alloc(sio_pool, GFP_NOIO);
+		init_sync_kiocb(&sio->iocb, swap_file);
+		sio->iocb.ki_complete = sio_write_complete;
+		sio->iocb.ki_pos = pos;
+		sio->pages = 0;
+		sio->len = 0;
+	}
+	sio->bvec[sio->pages].bv_page = page;
+	sio->bvec[sio->pages].bv_len = thp_size(page);
+	sio->bvec[sio->pages].bv_offset = 0;
+	sio->len += thp_size(page);
+	sio->pages += 1;
+	if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
+		swap_write_unplug(sio);
+		sio = NULL;
+	}
+	if (wbc->swap_plug)
+		*wbc->swap_plug = sio;
+
+	return 0;
+}
+
+int __swap_writepage(struct page *page, struct writeback_control *wbc,
+		bio_end_io_t end_write_func)
+{
+	struct bio *bio;
+	int ret;
+	struct swap_info_struct *sis = page_swap_info(page);
+
+	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+	/*
+	 * ->flags can be updated non-atomicially (scan_swap_map_slots),
+	 * but that will never affect SWP_FS_OPS, so the data_race
+	 * is safe.
+	 */
+	if (data_race(sis->flags & SWP_FS_OPS))
+		return swap_writepage_fs(page, wbc);
+
 	ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
 	if (!ret) {
 		count_swpout_vm_event(page);
@@ -305,7 +370,83 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 	return 0;
 }
 
-int swap_readpage(struct page *page, bool synchronous)
+void swap_write_unplug(struct swap_iocb *sio)
+{
+	struct iov_iter from;
+	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+	int ret;
+
+	iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages, sio->len);
+	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+	if (ret != -EIOCBQUEUED)
+		sio_write_complete(&sio->iocb, ret);
+}
+
+static void sio_read_complete(struct kiocb *iocb, long ret)
+{
+	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
+	int p;
+
+	if (ret == sio->len) {
+		for (p = 0; p < sio->pages; p++) {
+			struct page *page = sio->bvec[p].bv_page;
+
+			SetPageUptodate(page);
+			unlock_page(page);
+		}
+		count_vm_events(PSWPIN, sio->pages);
+	} else {
+		for (p = 0; p < sio->pages; p++) {
+			struct page *page = sio->bvec[p].bv_page;
+
+			SetPageError(page);
+			ClearPageUptodate(page);
+			unlock_page(page);
+		}
+		pr_alert_ratelimited("Read-error on swap-device\n");
+	}
+	mempool_free(sio, sio_pool);
+}
+
+static void swap_readpage_fs(struct page *page,
+			     struct swap_iocb **plug)
+{
+	struct swap_info_struct *sis = page_swap_info(page);
+	struct swap_iocb *sio = NULL;
+	loff_t pos = page_file_offset(page);
+
+	if (plug)
+		sio = *plug;
+	if (sio) {
+		if (sio->iocb.ki_filp != sis->swap_file ||
+		    sio->iocb.ki_pos + sio->len != pos) {
+			swap_read_unplug(sio);
+			sio = NULL;
+		}
+	}
+	if (!sio) {
+		sio = mempool_alloc(sio_pool, GFP_KERNEL);
+		init_sync_kiocb(&sio->iocb, sis->swap_file);
+		sio->iocb.ki_pos = pos;
+		sio->iocb.ki_complete = sio_read_complete;
+		sio->pages = 0;
+		sio->len = 0;
+	}
+	sio->bvec[sio->pages].bv_page = page;
+	sio->bvec[sio->pages].bv_len = thp_size(page);
+	sio->bvec[sio->pages].bv_offset = 0;
+	sio->len += thp_size(page);
+	sio->pages += 1;
+	if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
+		swap_read_unplug(sio);
+		sio = NULL;
+	}
+	if (plug)
+		*plug = sio;
+}
+
+int swap_readpage(struct page *page, bool synchronous,
+		  struct swap_iocb **plug)
 {
 	struct bio *bio;
 	int ret = 0;
@@ -333,12 +474,7 @@ int swap_readpage(struct page *page, bool synchronous)
 	}
 
 	if (data_race(sis->flags & SWP_FS_OPS)) {
-		struct file *swap_file = sis->swap_file;
-		struct address_space *mapping = swap_file->f_mapping;
-
-		ret = mapping->a_ops->read_folio(swap_file, page_folio(page));
-		if (!ret)
-			count_vm_event(PSWPIN);
+		swap_readpage_fs(page, plug);
 		goto out;
 	}
 
@@ -383,19 +519,14 @@ out:
 	return ret;
 }
 
-bool swap_dirty_folio(struct address_space *mapping, struct folio *folio)
+void __swap_read_unplug(struct swap_iocb *sio)
 {
-	struct swap_info_struct *sis = swp_swap_info(folio_swap_entry(folio));
-
-	if (data_race(sis->flags & SWP_FS_OPS)) {
-		const struct address_space_operations *aops;
-
-		mapping = sis->swap_file->f_mapping;
-		aops = mapping->a_ops;
+	struct iov_iter from;
+	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+	int ret;
 
-		VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
-		return aops->dirty_folio(mapping, folio);
-	} else {
-		return noop_dirty_folio(mapping, folio);
-	}
+	iov_iter_bvec(&from, READ, sio->bvec, sio->pages, sio->len);
+	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+	if (ret != -EIOCBQUEUED)
+		sio_read_complete(&sio->iocb, ret);
 }
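What ties the new read and write paths together is the swap_iocb "plug": instead of the old per-page direct_IO() write and read_folio() read for SWP_FS_OPS swap files, up to SWAP_CLUSTER_MAX contiguous pages destined for the same swap file are accumulated in one swap_iocb and submitted with a single ->swap_rw() call when the batch fills or the caller unplugs. As a rough illustration of how a caller is expected to drive the read side, here is a minimal, hypothetical sketch (not part of the diff); it assumes a swap_read_unplug() helper that ignores a NULL plug before calling __swap_read_unplug(), and it omits the swap-cache lookup and page allocation a real readahead loop would do:

	/*
	 * Hypothetical caller sketch: batch several locked swap-cache pages
	 * through the SWP_FS_OPS read path and submit them in one go.
	 */
	static void swap_read_batch_sketch(struct page **pages, int nr)
	{
		struct swap_iocb *splug = NULL;	/* plug accumulating contiguous pages */
		int i;

		for (i = 0; i < nr; i++)
			/*
			 * Appends to the current swap_iocb when the swap file
			 * and offset are contiguous, otherwise unplugs it and
			 * starts a new batch.
			 */
			swap_readpage(pages[i], false, &splug);

		/* Flush whatever is still queued as a single ->swap_rw() request. */
		swap_read_unplug(splug);
	}

The write side mirrors this pattern: swap_writepage_fs() accumulates pages through wbc->swap_plug, and swap_write_unplug() issues the combined WRITE ->swap_rw() when the batch is full or writeback finishes its pass.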