diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-26 12:32:41 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-26 12:32:41 -0700 |
commit | 98931dd95fd489fcbfa97da563505a6f071d7c77 (patch) | |
tree | 44683fc4a92efa614acdca2742a7ff19d26da1e3 /mm/vmscan.c | |
parent | df202b452fe6c6d6f1351bad485e2367ef1e644e (diff) | |
parent | f403f22f8ccb12860b2b62fec3173c6ccd45938b (diff) | |
download | lwn-98931dd95fd489fcbfa97da563505a6f071d7c77.tar.gz lwn-98931dd95fd489fcbfa97da563505a6f071d7c77.zip |
Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
"Almost all of MM here. A few things are still getting finished off,
reviewed, etc.
- Yang Shi has improved the behaviour of khugepaged collapsing of
readonly file-backed transparent hugepages.
- Johannes Weiner has arranged for zswap memory use to be tracked and
managed on a per-cgroup basis.
- Munchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for
runtime enablement of the recent huge page vmemmap optimization
feature.
- Baolin Wang contributes a series to fix some issues around hugetlb
pagetable invalidation.
- Zhenwei Pi has fixed some interactions between hwpoisoned pages and
virtualization.
- Tong Tiangen has enabled the use of the presently x86-only
page_table_check debugging feature on arm64 and riscv.
- David Vernet has done some fixup work on the memcg selftests.
- Peter Xu has taught userfaultfd to handle write protection faults
against shmem- and hugetlbfs-backed files.
- More DAMON development from SeongJae Park - adding online tuning of
the feature and support for monitoring of fixed virtual address
ranges. Also easier discovery of which monitoring operations are
available.
- Nadav Amit has done some optimization of TLB flushing during
mprotect().
- Neil Brown continues to labor away at improving our swap-over-NFS
support.
- David Hildenbrand has some fixes to anon page COWing versus
get_user_pages().
- Peng Liu fixed some errors in the core hugetlb code.
- Joao Martins has reduced the amount of memory consumed by
device-dax's compound devmaps.
- Some cleanups of the arch-specific pagemap code from Anshuman
Khandual.
- Muchun Song has found and fixed some errors in the TLB flushing of
transparent hugepages.
- Roman Gushchin has done more work on the memcg selftests.
... and, of course, many smaller fixes and cleanups. Notably, the
customary million cleanup serieses from Miaohe Lin"
* tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits)
mm: kfence: use PAGE_ALIGNED helper
selftests: vm: add the "settings" file with timeout variable
selftests: vm: add "test_hmm.sh" to TEST_FILES
selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests
selftests: vm: add migration to the .gitignore
selftests/vm/pkeys: fix typo in comment
ksm: fix typo in comment
selftests: vm: add process_mrelease tests
Revert "mm/vmscan: never demote for memcg reclaim"
mm/kfence: print disabling or re-enabling message
include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace"
include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion"
mm: fix a potential infinite loop in start_isolate_page_range()
MAINTAINERS: add Muchun as co-maintainer for HugeTLB
zram: fix Kconfig dependency warning
mm/shmem: fix shmem folio swapoff hang
cgroup: fix an error handling path in alloc_pagecache_max_30M()
mm: damon: use HPAGE_PMD_SIZE
tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate
nodemask.h: fix compilation error with GCC12
...
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 476 |
1 files changed, 250 insertions, 226 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index edc89f26b738..f7d9a683e3a7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -59,6 +59,7 @@ #include <linux/sched/sysctl.h> #include "internal.h" +#include "swap.h" #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> @@ -527,13 +528,8 @@ static bool can_demote(int nid, struct scan_control *sc) { if (!numa_demotion_enabled) return false; - if (sc) { - if (sc->no_demotion) - return false; - /* It is pointless to do demotion in memcg reclaim */ - if (cgroup_reclaim(sc)) - return false; - } + if (sc && sc->no_demotion) + return false; if (next_demotion_node(nid) == NUMA_NO_NODE) return false; @@ -587,7 +583,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector * @lru: lru to use - * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list) + * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) @@ -595,7 +591,7 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, unsigned long size = 0; int zid; - for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { + for (zid = 0; zid <= zone_idx; zid++) { struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; if (!managed_zone(zone)) @@ -1031,7 +1027,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; reclaimable += zone_reclaimable_pages(zone); @@ -1155,7 +1151,8 @@ typedef enum { * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -static pageout_t pageout(struct folio *folio, struct address_space *mapping) +static pageout_t pageout(struct folio *folio, struct address_space *mapping, + struct swap_iocb **plug) { /* * If the folio is dirty, only perform writeback if that write @@ -1200,6 +1197,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping) .range_start = 0, .range_end = LLONG_MAX, .for_reclaim = 1, + .swap_plug = plug, }; folio_set_reclaim(folio); @@ -1388,6 +1386,10 @@ static enum page_references folio_check_references(struct folio *folio, if (vm_flags & VM_LOCKED) return PAGEREF_ACTIVATE; + /* rmap lock contention: rotate */ + if (referenced_ptes == -1) + return PAGEREF_KEEP; + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -1411,14 +1413,14 @@ static enum page_references folio_check_references(struct folio *folio, /* * Activate file-backed executable folios after first usage. */ - if ((vm_flags & VM_EXEC) && !folio_test_swapbacked(folio)) + if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) return PAGEREF_ACTIVATE; return PAGEREF_KEEP; } /* Reclaim if clean, defer dirty folios to writeback */ - if (referenced_folio && !folio_test_swapbacked(folio)) + if (referenced_folio && folio_is_file_lru(folio)) return PAGEREF_RECLAIM_CLEAN; return PAGEREF_RECLAIM; @@ -1432,7 +1434,10 @@ static void folio_check_dirty_writeback(struct folio *folio, /* * Anonymous pages are not handled by flushers and must be written - * from reclaim context. Do not stall reclaim based on them + * from reclaim context. Do not stall reclaim based on them. + * MADV_FREE anonymous pages are put into inactive file list too. + * They could be mistakenly treated as file lru. So further anon + * test is needed. */ if (!folio_is_file_lru(folio) || (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { @@ -1501,6 +1506,22 @@ static unsigned int demote_page_list(struct list_head *demote_pages, return nr_succeeded; } +static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) +{ + if (gfp_mask & __GFP_FS) + return true; + if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) + return false; + /* + * We can "enter_fs" for swap-cache with only __GFP_IO + * providing this isn't SWP_FS_OPS. + * ->flags can be updated non-atomicially (scan_swap_map_slots), + * but that will never affect SWP_FS_OPS, so the data_race + * is safe. + */ + return !data_race(page_swap_flags(&folio->page) & SWP_FS_OPS); +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -1516,6 +1537,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; + struct swap_iocb *plug = NULL; memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1524,41 +1546,36 @@ static unsigned int shrink_page_list(struct list_head *page_list, retry: while (!list_empty(page_list)) { struct address_space *mapping; - struct page *page; struct folio *folio; enum page_references references = PAGEREF_RECLAIM; - bool dirty, writeback, may_enter_fs; + bool dirty, writeback; unsigned int nr_pages; cond_resched(); folio = lru_to_folio(page_list); list_del(&folio->lru); - page = &folio->page; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto keep; - VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); - /* Account the number of base pages even though THP */ + /* Account the number of base pages */ sc->nr_scanned += nr_pages; - if (unlikely(!page_evictable(page))) + if (unlikely(!folio_evictable(folio))) goto activate_locked; - if (!sc->may_unmap && page_mapped(page)) + if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); - /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing - * pages if the tail of the LRU is all dirty unqueued pages. + * folios if the tail of the LRU is all dirty unqueued folios. */ folio_check_dirty_writeback(folio, &dirty, &writeback); if (dirty || writeback) @@ -1568,50 +1585,51 @@ retry: stat->nr_unqueued_dirty += nr_pages; /* - * Treat this page as congested if the underlying BDI is or if - * pages are cycling through the LRU so quickly that the - * pages marked for immediate reclaim are making it to the - * end of the LRU a second time. + * Treat this folio as congested if folios are cycling + * through the LRU so quickly that the folios marked + * for immediate reclaim are making it to the end of + * the LRU a second time. */ - mapping = page_mapping(page); - if (writeback && PageReclaim(page)) + if (writeback && folio_test_reclaim(folio)) stat->nr_congested += nr_pages; /* - * If a page at the tail of the LRU is under writeback, there + * If a folio at the tail of the LRU is under writeback, there * are three cases to consider. * - * 1) If reclaim is encountering an excessive number of pages - * under writeback and this page is both under writeback and - * PageReclaim then it indicates that pages are being queued - * for IO but are being recycled through the LRU before the - * IO can complete. Waiting on the page itself risks an - * indefinite stall if it is impossible to writeback the - * page due to IO error or disconnected storage so instead - * note that the LRU is being scanned too quickly and the - * caller can stall after page list has been processed. + * 1) If reclaim is encountering an excessive number + * of folios under writeback and this folio has both + * the writeback and reclaim flags set, then it + * indicates that folios are being queued for I/O but + * are being recycled through the LRU before the I/O + * can complete. Waiting on the folio itself risks an + * indefinite stall if it is impossible to writeback + * the folio due to I/O error or disconnected storage + * so instead note that the LRU is being scanned too + * quickly and the caller can stall after the folio + * list has been processed. * - * 2) Global or new memcg reclaim encounters a page that is + * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, - * not to fs). In this case mark the page for immediate + * not to fs). In this case mark the folio for immediate * reclaim and continue scanning. * - * Require may_enter_fs because we would wait on fs, which - * may not have submitted IO yet. And the loop driver might - * enter reclaim, and deadlock if it waits on a page for + * Require may_enter_fs() because we would wait on fs, which + * may not have submitted I/O yet. And the loop driver might + * enter reclaim, and deadlock if it waits on a folio for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * - * 3) Legacy memcg encounters a page that is already marked - * PageReclaim. memcg does not have any dirty pages + * 3) Legacy memcg encounters a folio that already has the + * reclaim flag set. memcg does not have any dirty folio * throttling so we could easily OOM just because too many - * pages are in writeback and there is nothing else to + * folios are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. * - * In cases 1) and 2) we activate the pages to get them out of - * the way while we continue scanning for clean pages on the + * In cases 1) and 2) we activate the folios to get them out of + * the way while we continue scanning for clean folios on the * inactive list and refilling from the active list. The * observation here is that waiting for disk writes is more * expensive than potentially causing reloads down the line. @@ -1619,38 +1637,42 @@ retry: * memory pressure on the cache working set any longer than it * takes to write them to disk. */ - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { /* Case 1 above */ if (current_is_kswapd() && - PageReclaim(page) && + folio_test_reclaim(folio) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { stat->nr_immediate += nr_pages; goto activate_locked; /* Case 2 above */ } else if (writeback_throttling_sane(sc) || - !PageReclaim(page) || !may_enter_fs) { + !folio_test_reclaim(folio) || + !may_enter_fs(folio, sc->gfp_mask)) { /* - * This is slightly racy - end_page_writeback() - * might have just cleared PageReclaim, then - * setting PageReclaim here end up interpreted - * as PageReadahead - but that does not matter - * enough to care. What we do want is for this - * page to have PageReclaim set next time memcg - * reclaim reaches the tests above, so it will - * then wait_on_page_writeback() to avoid OOM; - * and it's also appropriate in global reclaim. + * This is slightly racy - + * folio_end_writeback() might have + * just cleared the reclaim flag, then + * setting the reclaim flag here ends up + * interpreted as the readahead flag - but + * that does not matter enough to care. + * What we do want is for this folio to + * have the reclaim flag set next time + * memcg reclaim reaches the tests above, + * so it will then wait for writeback to + * avoid OOM; and it's also appropriate + * in global reclaim. */ - SetPageReclaim(page); + folio_set_reclaim(folio); stat->nr_writeback += nr_pages; goto activate_locked; /* Case 3 above */ } else { - unlock_page(page); - wait_on_page_writeback(page); - /* then go back and try same page again */ - list_add_tail(&page->lru, page_list); + folio_unlock(folio); + folio_wait_writeback(folio); + /* then go back and try same folio again */ + list_add_tail(&folio->lru, page_list); continue; } } @@ -1666,37 +1688,37 @@ retry: goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: - ; /* try to reclaim the page below */ + ; /* try to reclaim the folio below */ } /* - * Before reclaiming the page, try to relocate + * Before reclaiming the folio, try to relocate * its contents to another node. */ if (do_demote_pass && - (thp_migration_supported() || !PageTransHuge(page))) { - list_add(&page->lru, &demote_pages); - unlock_page(page); + (thp_migration_supported() || !folio_test_large(folio))) { + list_add(&folio->lru, &demote_pages); + folio_unlock(folio); continue; } /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. - * Lazyfree page could be freed directly + * Lazyfree folio could be freed directly */ - if (PageAnon(page) && PageSwapBacked(page)) { - if (!PageSwapCache(page)) { + if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { + if (!folio_test_swapcache(folio)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (folio_maybe_dma_pinned(folio)) goto keep_locked; - if (PageTransHuge(page)) { - /* cannot split THP, skip it */ + if (folio_test_large(folio)) { + /* cannot split folio, skip it */ if (!can_split_folio(folio, NULL)) goto activate_locked; /* - * Split pages without a PMD map right + * Split folios without a PMD map right * away. Chances are some or all of the * tail pages can be freed without IO. */ @@ -1705,8 +1727,8 @@ retry: page_list)) goto activate_locked; } - if (!add_to_swap(page)) { - if (!PageTransHuge(page)) + if (!add_to_swap(folio)) { + if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ if (split_folio_to_list(folio, @@ -1715,94 +1737,92 @@ retry: #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_vm_event(THP_SWPOUT_FALLBACK); #endif - if (!add_to_swap(page)) + if (!add_to_swap(folio)) goto activate_locked_split; } - - may_enter_fs = true; - - /* Adding to swap updated mapping */ - mapping = page_mapping(page); } - } else if (PageSwapBacked(page) && PageTransHuge(page)) { - /* Split shmem THP */ + } else if (folio_test_swapbacked(folio) && + folio_test_large(folio)) { + /* Split shmem folio */ if (split_folio_to_list(folio, page_list)) goto keep_locked; } /* - * THP may get split above, need minus tail pages and update - * nr_pages to avoid accounting tail pages twice. - * - * The tail pages that are added into swap cache successfully - * reach here. + * If the folio was split above, the tail pages will make + * their own pass through this function and be accounted + * then. */ - if ((nr_pages > 1) && !PageTransHuge(page)) { + if ((nr_pages > 1) && !folio_test_large(folio)) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } /* - * The page is mapped into the page tables of one or more + * The folio is mapped into the page tables of one or more * processes. Try to unmap it here. */ - if (page_mapped(page)) { + if (folio_mapped(folio)) { enum ttu_flags flags = TTU_BATCH_FLUSH; - bool was_swapbacked = PageSwapBacked(page); + bool was_swapbacked = folio_test_swapbacked(folio); - if (PageTransHuge(page) && - thp_order(page) >= HPAGE_PMD_ORDER) + if (folio_test_pmd_mappable(folio)) flags |= TTU_SPLIT_HUGE_PMD; try_to_unmap(folio, flags); - if (page_mapped(page)) { + if (folio_mapped(folio)) { stat->nr_unmap_fail += nr_pages; - if (!was_swapbacked && PageSwapBacked(page)) + if (!was_swapbacked && + folio_test_swapbacked(folio)) stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } } - if (PageDirty(page)) { + mapping = folio_mapping(folio); + if (folio_test_dirty(folio)) { /* - * Only kswapd can writeback filesystem pages + * Only kswapd can writeback filesystem folios * to avoid risk of stack overflow. But avoid - * injecting inefficient single-page IO into + * injecting inefficient single-folio I/O into * flusher writeback as much as possible: only - * write pages when we've encountered many - * dirty pages, and when we've already scanned - * the rest of the LRU for clean pages and see - * the same dirty pages again (PageReclaim). + * write folios when we've encountered many + * dirty folios, and when we've already scanned + * the rest of the LRU for clean folios and see + * the same dirty folios again (with the reclaim + * flag set). */ - if (page_is_file_lru(page) && - (!current_is_kswapd() || !PageReclaim(page) || + if (folio_is_file_lru(folio) && + (!current_is_kswapd() || + !folio_test_reclaim(folio) || !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. - * Similar in principal to deactivate_page() - * except we already have the page isolated + * Similar in principle to deactivate_page() + * except we already have the folio isolated * and know it's dirty */ - inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); - SetPageReclaim(page); + node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, + nr_pages); + folio_set_reclaim(folio); goto activate_locked; } if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; - if (!may_enter_fs) + if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; if (!sc->may_writepage) goto keep_locked; /* - * Page is dirty. Flush the TLB if a writable entry - * potentially exists to avoid CPU writes after IO + * Folio is dirty. Flush the TLB if a writable entry + * potentially exists to avoid CPU writes after I/O * starts and then write it out here. */ try_to_unmap_flush_dirty(); - switch (pageout(folio, mapping)) { + switch (pageout(folio, mapping, &plug)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: @@ -1810,91 +1830,94 @@ retry: case PAGE_SUCCESS: stat->nr_pageout += nr_pages; - if (PageWriteback(page)) + if (folio_test_writeback(folio)) goto keep; - if (PageDirty(page)) + if (folio_test_dirty(folio)) goto keep; /* * A synchronous write - probably a ramdisk. Go - * ahead and try to reclaim the page. + * ahead and try to reclaim the folio. */ - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto keep; - if (PageDirty(page) || PageWriteback(page)) + if (folio_test_dirty(folio) || + folio_test_writeback(folio)) goto keep_locked; - mapping = page_mapping(page); + mapping = folio_mapping(folio); fallthrough; case PAGE_CLEAN: - ; /* try to free the page below */ + ; /* try to free the folio below */ } } /* - * If the page has buffers, try to free the buffer mappings - * associated with this page. If we succeed we try to free - * the page as well. + * If the folio has buffers, try to free the buffer + * mappings associated with this folio. If we succeed + * we try to free the folio as well. * - * We do this even if the page is PageDirty(). - * try_to_release_page() does not perform I/O, but it is - * possible for a page to have PageDirty set, but it is actually - * clean (all its buffers are clean). This happens if the - * buffers were written out directly, with submit_bh(). ext3 - * will do this, as well as the blockdev mapping. - * try_to_release_page() will discover that cleanness and will - * drop the buffers and mark the page clean - it can be freed. + * We do this even if the folio is dirty. + * filemap_release_folio() does not perform I/O, but it + * is possible for a folio to have the dirty flag set, + * but it is actually clean (all its buffers are clean). + * This happens if the buffers were written out directly, + * with submit_bh(). ext3 will do this, as well as + * the blockdev mapping. filemap_release_folio() will + * discover that cleanness and will drop the buffers + * and mark the folio clean - it can be freed. * - * Rarely, pages can have buffers and no ->mapping. These are - * the pages which were not successfully invalidated in - * truncate_cleanup_page(). We try to drop those buffers here - * and if that worked, and the page is no longer mapped into - * process address space (page_count == 1) it can be freed. - * Otherwise, leave the page on the LRU so it is swappable. + * Rarely, folios can have buffers and no ->mapping. + * These are the folios which were not successfully + * invalidated in truncate_cleanup_folio(). We try to + * drop those buffers here and if that worked, and the + * folio is no longer mapped into process address space + * (refcount == 1) it can be freed. Otherwise, leave + * the folio on the LRU so it is swappable. */ - if (page_has_private(page)) { - if (!try_to_release_page(page, sc->gfp_mask)) + if (folio_has_private(folio)) { + if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; - if (!mapping && page_count(page) == 1) { - unlock_page(page); - if (put_page_testzero(page)) + if (!mapping && folio_ref_count(folio) == 1) { + folio_unlock(folio); + if (folio_put_testzero(folio)) goto free_it; else { /* * rare race with speculative reference. * the speculative reference will free - * this page shortly, so we may + * this folio shortly, so we may * increment nr_reclaimed here (and * leave it off the LRU). */ - nr_reclaimed++; + nr_reclaimed += nr_pages; continue; } } } - if (PageAnon(page) && !PageSwapBacked(page)) { + if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { /* follow __remove_mapping for reference */ - if (!page_ref_freeze(page, 1)) + if (!folio_ref_freeze(folio, 1)) goto keep_locked; /* - * The page has only one reference left, which is + * The folio has only one reference left, which is * from the isolation. After the caller puts the - * page back on lru and drops the reference, the - * page will be freed anyway. It doesn't matter - * which lru it goes. So we don't bother checking - * PageDirty here. + * folio back on the lru and drops the reference, the + * folio will be freed anyway. It doesn't matter + * which lru it goes on. So we don't bother checking + * the dirty flag here. */ - count_vm_event(PGLAZYFREED); - count_memcg_page_event(page, PGLAZYFREED); + count_vm_events(PGLAZYFREED, nr_pages); + count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); } else if (!mapping || !__remove_mapping(mapping, folio, true, sc->target_mem_cgroup)) goto keep_locked; - unlock_page(page); + folio_unlock(folio); free_it: /* - * THP may get swapped out in a whole, need account - * all base pages. + * Folio may get swapped out as a whole, need to account + * all pages in it. */ nr_reclaimed += nr_pages; @@ -1902,10 +1925,10 @@ free_it: * Is there need to periodically free_page_list? It would * appear not as the counts should be low */ - if (unlikely(PageTransHuge(page))) - destroy_compound_page(page); + if (unlikely(folio_test_large(folio))) + destroy_compound_page(&folio->page); else - list_add(&page->lru, &free_pages); + list_add(&folio->lru, &free_pages); continue; activate_locked_split: @@ -1919,29 +1942,31 @@ activate_locked_split: } activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || - PageMlocked(page))) - try_to_free_swap(page); - VM_BUG_ON_PAGE(PageActive(page), page); - if (!PageMlocked(page)) { - int type = page_is_file_lru(page); - SetPageActive(page); + if (folio_test_swapcache(folio) && + (mem_cgroup_swap_full(&folio->page) || + folio_test_mlocked(folio))) + try_to_free_swap(&folio->page); + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); + if (!folio_test_mlocked(folio)) { + int type = folio_is_file_lru(folio); + folio_set_active(folio); stat->nr_activate[type] += nr_pages; - count_memcg_page_event(page, PGACTIVATE); + count_memcg_folio_events(folio, PGACTIVATE, nr_pages); } keep_locked: - unlock_page(page); + folio_unlock(folio); keep: - list_add(&page->lru, &ret_pages); - VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); + list_add(&folio->lru, &ret_pages); + VM_BUG_ON_FOLIO(folio_test_lru(folio) || + folio_test_unevictable(folio), folio); } /* 'page_list' is always empty here */ - /* Migrate pages selected for demotion */ + /* Migrate folios selected for demotion */ nr_reclaimed += demote_page_list(&demote_pages, pgdat); - /* Pages that could not be demoted are still in @demote_pages */ + /* Folios that could not be demoted are still in @demote_pages */ if (!list_empty(&demote_pages)) { - /* Pages which failed to demoted go back on @page_list for retry: */ + /* Folios which weren't demoted go back on @page_list for retry: */ list_splice_init(&demote_pages, page_list); do_demote_pass = false; goto retry; @@ -1956,6 +1981,8 @@ keep: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); + if (plug) + swap_write_unplug(plug); return nr_reclaimed; } @@ -2117,8 +2144,8 @@ move: * Splice any skipped pages to the start of the LRU list. Note that * this disrupts the LRU order when reclaiming for lower zones but * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX - * scanning would soon rescan the same pages to skip and put the - * system at risk of premature OOM. + * scanning would soon rescan the same pages to skip and waste lots + * of cpu cycles. */ if (!list_empty(&pages_skipped)) { int zid; @@ -2297,10 +2324,9 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, } /* - * If a kernel thread (such as nfsd for loop-back mounts) services - * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE. - * In that case we should only throttle if the backing device it is - * writing to is congested. In other cases it is safe to throttle. + * If a kernel thread (such as nfsd for loop-back mounts) services a backing + * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case + * we should not throttle. Otherwise it is safe to do so. */ static int current_may_throttle(void) { @@ -2472,8 +2498,9 @@ static void shrink_active_list(unsigned long nr_to_scan, } } + /* Referenced or rmap lock contention: rotate */ if (folio_referenced(folio, 0, sc->target_mem_cgroup, - &vm_flags)) { + &vm_flags) != 0) { /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -2517,14 +2544,12 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate, nr_rotated, sc->priority, file); } -unsigned long reclaim_pages(struct list_head *page_list) +static unsigned int reclaim_page_list(struct list_head *page_list, + struct pglist_data *pgdat) { - int nid = NUMA_NO_NODE; - unsigned int nr_reclaimed = 0; - LIST_HEAD(node_page_list); struct reclaim_stat dummy_stat; - struct page *page; - unsigned int noreclaim_flag; + unsigned int nr_reclaimed; + struct folio *folio; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_writepage = 1, @@ -2533,14 +2558,32 @@ unsigned long reclaim_pages(struct list_head *page_list) .no_demotion = 1, }; + nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false); + while (!list_empty(page_list)) { + folio = lru_to_folio(page_list); + list_del(&folio->lru); + folio_putback_lru(folio); + } + + return nr_reclaimed; +} + +unsigned long reclaim_pages(struct list_head *page_list) +{ + int nid; + unsigned int nr_reclaimed = 0; + LIST_HEAD(node_page_list); + struct page *page; + unsigned int noreclaim_flag; + + if (list_empty(page_list)) + return nr_reclaimed; + noreclaim_flag = memalloc_noreclaim_save(); - while (!list_empty(page_list)) { + nid = page_to_nid(lru_to_page(page_list)); + do { page = lru_to_page(page_list); - if (nid == NUMA_NO_NODE) { - nid = page_to_nid(page); - INIT_LIST_HEAD(&node_page_list); - } if (nid == page_to_nid(page)) { ClearPageActive(page); @@ -2548,28 +2591,11 @@ unsigned long reclaim_pages(struct list_head *page_list) continue; } - nr_reclaimed += shrink_page_list(&node_page_list, - NODE_DATA(nid), - &sc, &dummy_stat, false); - while (!list_empty(&node_page_list)) { - page = lru_to_page(&node_page_list); - list_del(&page->lru); - putback_lru_page(page); - } - - nid = NUMA_NO_NODE; - } + nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); + nid = page_to_nid(lru_to_page(page_list)); + } while (!list_empty(page_list)); - if (!list_empty(&node_page_list)) { - nr_reclaimed += shrink_page_list(&node_page_list, - NODE_DATA(nid), - &sc, &dummy_stat, false); - while (!list_empty(&node_page_list)) { - page = lru_to_page(&node_page_list); - list_del(&page->lru); - putback_lru_page(page); - } - } + nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); @@ -2646,9 +2672,7 @@ enum scan_balance { /* * Determine how aggressively the anon and file LRU lists should be - * scanned. The relative value of each set of LRU lists is determined - * by looking at the fraction of the pages scanned we did rotate back - * onto the active list instead of evict. + * scanned. * * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan @@ -3912,7 +3936,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) } /* - * If a node has no populated zone within highest_zoneidx, it does not + * If a node has no managed zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ @@ -4552,7 +4576,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) /* * This kswapd start function will be called by init and node-hot-add. - * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ void kswapd_run(int nid) { @@ -4699,7 +4722,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(p, &sc.reclaim_state); - if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { + if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || + node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { /* * Free memory by calling shrink node with increasing * priorities until we have enough memory freed. |