summaryrefslogtreecommitdiff
path: root/mm/vmscan.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-05-26 12:32:41 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-05-26 12:32:41 -0700
commit98931dd95fd489fcbfa97da563505a6f071d7c77 (patch)
tree44683fc4a92efa614acdca2742a7ff19d26da1e3 /mm/vmscan.c
parentdf202b452fe6c6d6f1351bad485e2367ef1e644e (diff)
parentf403f22f8ccb12860b2b62fec3173c6ccd45938b (diff)
downloadlwn-98931dd95fd489fcbfa97da563505a6f071d7c77.tar.gz
lwn-98931dd95fd489fcbfa97da563505a6f071d7c77.zip
Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: "Almost all of MM here. A few things are still getting finished off, reviewed, etc. - Yang Shi has improved the behaviour of khugepaged collapsing of readonly file-backed transparent hugepages. - Johannes Weiner has arranged for zswap memory use to be tracked and managed on a per-cgroup basis. - Munchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for runtime enablement of the recent huge page vmemmap optimization feature. - Baolin Wang contributes a series to fix some issues around hugetlb pagetable invalidation. - Zhenwei Pi has fixed some interactions between hwpoisoned pages and virtualization. - Tong Tiangen has enabled the use of the presently x86-only page_table_check debugging feature on arm64 and riscv. - David Vernet has done some fixup work on the memcg selftests. - Peter Xu has taught userfaultfd to handle write protection faults against shmem- and hugetlbfs-backed files. - More DAMON development from SeongJae Park - adding online tuning of the feature and support for monitoring of fixed virtual address ranges. Also easier discovery of which monitoring operations are available. - Nadav Amit has done some optimization of TLB flushing during mprotect(). - Neil Brown continues to labor away at improving our swap-over-NFS support. - David Hildenbrand has some fixes to anon page COWing versus get_user_pages(). - Peng Liu fixed some errors in the core hugetlb code. - Joao Martins has reduced the amount of memory consumed by device-dax's compound devmaps. - Some cleanups of the arch-specific pagemap code from Anshuman Khandual. - Muchun Song has found and fixed some errors in the TLB flushing of transparent hugepages. - Roman Gushchin has done more work on the memcg selftests. ... and, of course, many smaller fixes and cleanups. Notably, the customary million cleanup serieses from Miaohe Lin" * tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits) mm: kfence: use PAGE_ALIGNED helper selftests: vm: add the "settings" file with timeout variable selftests: vm: add "test_hmm.sh" to TEST_FILES selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests selftests: vm: add migration to the .gitignore selftests/vm/pkeys: fix typo in comment ksm: fix typo in comment selftests: vm: add process_mrelease tests Revert "mm/vmscan: never demote for memcg reclaim" mm/kfence: print disabling or re-enabling message include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace" include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion" mm: fix a potential infinite loop in start_isolate_page_range() MAINTAINERS: add Muchun as co-maintainer for HugeTLB zram: fix Kconfig dependency warning mm/shmem: fix shmem folio swapoff hang cgroup: fix an error handling path in alloc_pagecache_max_30M() mm: damon: use HPAGE_PMD_SIZE tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate nodemask.h: fix compilation error with GCC12 ...
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c476
1 files changed, 250 insertions, 226 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index edc89f26b738..f7d9a683e3a7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -59,6 +59,7 @@
#include <linux/sched/sysctl.h>
#include "internal.h"
+#include "swap.h"
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
@@ -527,13 +528,8 @@ static bool can_demote(int nid, struct scan_control *sc)
{
if (!numa_demotion_enabled)
return false;
- if (sc) {
- if (sc->no_demotion)
- return false;
- /* It is pointless to do demotion in memcg reclaim */
- if (cgroup_reclaim(sc))
- return false;
- }
+ if (sc && sc->no_demotion)
+ return false;
if (next_demotion_node(nid) == NUMA_NO_NODE)
return false;
@@ -587,7 +583,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
* lruvec_lru_size - Returns the number of pages on the given LRU list.
* @lruvec: lru vector
* @lru: lru to use
- * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
+ * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
*/
static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
int zone_idx)
@@ -595,7 +591,7 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
unsigned long size = 0;
int zid;
- for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
+ for (zid = 0; zid <= zone_idx; zid++) {
struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
if (!managed_zone(zone))
@@ -1031,7 +1027,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
- if (!populated_zone(zone))
+ if (!managed_zone(zone))
continue;
reclaimable += zone_reclaimable_pages(zone);
@@ -1155,7 +1151,8 @@ typedef enum {
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
*/
-static pageout_t pageout(struct folio *folio, struct address_space *mapping)
+static pageout_t pageout(struct folio *folio, struct address_space *mapping,
+ struct swap_iocb **plug)
{
/*
* If the folio is dirty, only perform writeback if that write
@@ -1200,6 +1197,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping)
.range_start = 0,
.range_end = LLONG_MAX,
.for_reclaim = 1,
+ .swap_plug = plug,
};
folio_set_reclaim(folio);
@@ -1388,6 +1386,10 @@ static enum page_references folio_check_references(struct folio *folio,
if (vm_flags & VM_LOCKED)
return PAGEREF_ACTIVATE;
+ /* rmap lock contention: rotate */
+ if (referenced_ptes == -1)
+ return PAGEREF_KEEP;
+
if (referenced_ptes) {
/*
* All mapped folios start out with page table
@@ -1411,14 +1413,14 @@ static enum page_references folio_check_references(struct folio *folio,
/*
* Activate file-backed executable folios after first usage.
*/
- if ((vm_flags & VM_EXEC) && !folio_test_swapbacked(folio))
+ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
return PAGEREF_ACTIVATE;
return PAGEREF_KEEP;
}
/* Reclaim if clean, defer dirty folios to writeback */
- if (referenced_folio && !folio_test_swapbacked(folio))
+ if (referenced_folio && folio_is_file_lru(folio))
return PAGEREF_RECLAIM_CLEAN;
return PAGEREF_RECLAIM;
@@ -1432,7 +1434,10 @@ static void folio_check_dirty_writeback(struct folio *folio,
/*
* Anonymous pages are not handled by flushers and must be written
- * from reclaim context. Do not stall reclaim based on them
+ * from reclaim context. Do not stall reclaim based on them.
+ * MADV_FREE anonymous pages are put into inactive file list too.
+ * They could be mistakenly treated as file lru. So further anon
+ * test is needed.
*/
if (!folio_is_file_lru(folio) ||
(folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
@@ -1501,6 +1506,22 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
return nr_succeeded;
}
+static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
+{
+ if (gfp_mask & __GFP_FS)
+ return true;
+ if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO))
+ return false;
+ /*
+ * We can "enter_fs" for swap-cache with only __GFP_IO
+ * providing this isn't SWP_FS_OPS.
+ * ->flags can be updated non-atomicially (scan_swap_map_slots),
+ * but that will never affect SWP_FS_OPS, so the data_race
+ * is safe.
+ */
+ return !data_race(page_swap_flags(&folio->page) & SWP_FS_OPS);
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -1516,6 +1537,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
+ struct swap_iocb *plug = NULL;
memset(stat, 0, sizeof(*stat));
cond_resched();
@@ -1524,41 +1546,36 @@ static unsigned int shrink_page_list(struct list_head *page_list,
retry:
while (!list_empty(page_list)) {
struct address_space *mapping;
- struct page *page;
struct folio *folio;
enum page_references references = PAGEREF_RECLAIM;
- bool dirty, writeback, may_enter_fs;
+ bool dirty, writeback;
unsigned int nr_pages;
cond_resched();
folio = lru_to_folio(page_list);
list_del(&folio->lru);
- page = &folio->page;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto keep;
- VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
- nr_pages = compound_nr(page);
+ nr_pages = folio_nr_pages(folio);
- /* Account the number of base pages even though THP */
+ /* Account the number of base pages */
sc->nr_scanned += nr_pages;
- if (unlikely(!page_evictable(page)))
+ if (unlikely(!folio_evictable(folio)))
goto activate_locked;
- if (!sc->may_unmap && page_mapped(page))
+ if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
- may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
- (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
-
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
- * pages if the tail of the LRU is all dirty unqueued pages.
+ * folios if the tail of the LRU is all dirty unqueued folios.
*/
folio_check_dirty_writeback(folio, &dirty, &writeback);
if (dirty || writeback)
@@ -1568,50 +1585,51 @@ retry:
stat->nr_unqueued_dirty += nr_pages;
/*
- * Treat this page as congested if the underlying BDI is or if
- * pages are cycling through the LRU so quickly that the
- * pages marked for immediate reclaim are making it to the
- * end of the LRU a second time.
+ * Treat this folio as congested if folios are cycling
+ * through the LRU so quickly that the folios marked
+ * for immediate reclaim are making it to the end of
+ * the LRU a second time.
*/
- mapping = page_mapping(page);
- if (writeback && PageReclaim(page))
+ if (writeback && folio_test_reclaim(folio))
stat->nr_congested += nr_pages;
/*
- * If a page at the tail of the LRU is under writeback, there
+ * If a folio at the tail of the LRU is under writeback, there
* are three cases to consider.
*
- * 1) If reclaim is encountering an excessive number of pages
- * under writeback and this page is both under writeback and
- * PageReclaim then it indicates that pages are being queued
- * for IO but are being recycled through the LRU before the
- * IO can complete. Waiting on the page itself risks an
- * indefinite stall if it is impossible to writeback the
- * page due to IO error or disconnected storage so instead
- * note that the LRU is being scanned too quickly and the
- * caller can stall after page list has been processed.
+ * 1) If reclaim is encountering an excessive number
+ * of folios under writeback and this folio has both
+ * the writeback and reclaim flags set, then it
+ * indicates that folios are being queued for I/O but
+ * are being recycled through the LRU before the I/O
+ * can complete. Waiting on the folio itself risks an
+ * indefinite stall if it is impossible to writeback
+ * the folio due to I/O error or disconnected storage
+ * so instead note that the LRU is being scanned too
+ * quickly and the caller can stall after the folio
+ * list has been processed.
*
- * 2) Global or new memcg reclaim encounters a page that is
+ * 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the page for immediate
+ * not to fs). In this case mark the folio for immediate
* reclaim and continue scanning.
*
- * Require may_enter_fs because we would wait on fs, which
- * may not have submitted IO yet. And the loop driver might
- * enter reclaim, and deadlock if it waits on a page for
+ * Require may_enter_fs() because we would wait on fs, which
+ * may not have submitted I/O yet. And the loop driver might
+ * enter reclaim, and deadlock if it waits on a folio for
* which it is needed to do the write (loop masks off
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
- * 3) Legacy memcg encounters a page that is already marked
- * PageReclaim. memcg does not have any dirty pages
+ * 3) Legacy memcg encounters a folio that already has the
+ * reclaim flag set. memcg does not have any dirty folio
* throttling so we could easily OOM just because too many
- * pages are in writeback and there is nothing else to
+ * folios are in writeback and there is nothing else to
* reclaim. Wait for the writeback to complete.
*
- * In cases 1) and 2) we activate the pages to get them out of
- * the way while we continue scanning for clean pages on the
+ * In cases 1) and 2) we activate the folios to get them out of
+ * the way while we continue scanning for clean folios on the
* inactive list and refilling from the active list. The
* observation here is that waiting for disk writes is more
* expensive than potentially causing reloads down the line.
@@ -1619,38 +1637,42 @@ retry:
* memory pressure on the cache working set any longer than it
* takes to write them to disk.
*/
- if (PageWriteback(page)) {
+ if (folio_test_writeback(folio)) {
/* Case 1 above */
if (current_is_kswapd() &&
- PageReclaim(page) &&
+ folio_test_reclaim(folio) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
stat->nr_immediate += nr_pages;
goto activate_locked;
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
- !PageReclaim(page) || !may_enter_fs) {
+ !folio_test_reclaim(folio) ||
+ !may_enter_fs(folio, sc->gfp_mask)) {
/*
- * This is slightly racy - end_page_writeback()
- * might have just cleared PageReclaim, then
- * setting PageReclaim here end up interpreted
- * as PageReadahead - but that does not matter
- * enough to care. What we do want is for this
- * page to have PageReclaim set next time memcg
- * reclaim reaches the tests above, so it will
- * then wait_on_page_writeback() to avoid OOM;
- * and it's also appropriate in global reclaim.
+ * This is slightly racy -
+ * folio_end_writeback() might have
+ * just cleared the reclaim flag, then
+ * setting the reclaim flag here ends up
+ * interpreted as the readahead flag - but
+ * that does not matter enough to care.
+ * What we do want is for this folio to
+ * have the reclaim flag set next time
+ * memcg reclaim reaches the tests above,
+ * so it will then wait for writeback to
+ * avoid OOM; and it's also appropriate
+ * in global reclaim.
*/
- SetPageReclaim(page);
+ folio_set_reclaim(folio);
stat->nr_writeback += nr_pages;
goto activate_locked;
/* Case 3 above */
} else {
- unlock_page(page);
- wait_on_page_writeback(page);
- /* then go back and try same page again */
- list_add_tail(&page->lru, page_list);
+ folio_unlock(folio);
+ folio_wait_writeback(folio);
+ /* then go back and try same folio again */
+ list_add_tail(&folio->lru, page_list);
continue;
}
}
@@ -1666,37 +1688,37 @@ retry:
goto keep_locked;
case PAGEREF_RECLAIM:
case PAGEREF_RECLAIM_CLEAN:
- ; /* try to reclaim the page below */
+ ; /* try to reclaim the folio below */
}
/*
- * Before reclaiming the page, try to relocate
+ * Before reclaiming the folio, try to relocate
* its contents to another node.
*/
if (do_demote_pass &&
- (thp_migration_supported() || !PageTransHuge(page))) {
- list_add(&page->lru, &demote_pages);
- unlock_page(page);
+ (thp_migration_supported() || !folio_test_large(folio))) {
+ list_add(&folio->lru, &demote_pages);
+ folio_unlock(folio);
continue;
}
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
- * Lazyfree page could be freed directly
+ * Lazyfree folio could be freed directly
*/
- if (PageAnon(page) && PageSwapBacked(page)) {
- if (!PageSwapCache(page)) {
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
+ if (!folio_test_swapcache(folio)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
if (folio_maybe_dma_pinned(folio))
goto keep_locked;
- if (PageTransHuge(page)) {
- /* cannot split THP, skip it */
+ if (folio_test_large(folio)) {
+ /* cannot split folio, skip it */
if (!can_split_folio(folio, NULL))
goto activate_locked;
/*
- * Split pages without a PMD map right
+ * Split folios without a PMD map right
* away. Chances are some or all of the
* tail pages can be freed without IO.
*/
@@ -1705,8 +1727,8 @@ retry:
page_list))
goto activate_locked;
}
- if (!add_to_swap(page)) {
- if (!PageTransHuge(page))
+ if (!add_to_swap(folio)) {
+ if (!folio_test_large(folio))
goto activate_locked_split;
/* Fallback to swap normal pages */
if (split_folio_to_list(folio,
@@ -1715,94 +1737,92 @@ retry:
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
count_vm_event(THP_SWPOUT_FALLBACK);
#endif
- if (!add_to_swap(page))
+ if (!add_to_swap(folio))
goto activate_locked_split;
}
-
- may_enter_fs = true;
-
- /* Adding to swap updated mapping */
- mapping = page_mapping(page);
}
- } else if (PageSwapBacked(page) && PageTransHuge(page)) {
- /* Split shmem THP */
+ } else if (folio_test_swapbacked(folio) &&
+ folio_test_large(folio)) {
+ /* Split shmem folio */
if (split_folio_to_list(folio, page_list))
goto keep_locked;
}
/*
- * THP may get split above, need minus tail pages and update
- * nr_pages to avoid accounting tail pages twice.
- *
- * The tail pages that are added into swap cache successfully
- * reach here.
+ * If the folio was split above, the tail pages will make
+ * their own pass through this function and be accounted
+ * then.
*/
- if ((nr_pages > 1) && !PageTransHuge(page)) {
+ if ((nr_pages > 1) && !folio_test_large(folio)) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
/*
- * The page is mapped into the page tables of one or more
+ * The folio is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
- if (page_mapped(page)) {
+ if (folio_mapped(folio)) {
enum ttu_flags flags = TTU_BATCH_FLUSH;
- bool was_swapbacked = PageSwapBacked(page);
+ bool was_swapbacked = folio_test_swapbacked(folio);
- if (PageTransHuge(page) &&
- thp_order(page) >= HPAGE_PMD_ORDER)
+ if (folio_test_pmd_mappable(folio))
flags |= TTU_SPLIT_HUGE_PMD;
try_to_unmap(folio, flags);
- if (page_mapped(page)) {
+ if (folio_mapped(folio)) {
stat->nr_unmap_fail += nr_pages;
- if (!was_swapbacked && PageSwapBacked(page))
+ if (!was_swapbacked &&
+ folio_test_swapbacked(folio))
stat->nr_lazyfree_fail += nr_pages;
goto activate_locked;
}
}
- if (PageDirty(page)) {
+ mapping = folio_mapping(folio);
+ if (folio_test_dirty(folio)) {
/*
- * Only kswapd can writeback filesystem pages
+ * Only kswapd can writeback filesystem folios
* to avoid risk of stack overflow. But avoid
- * injecting inefficient single-page IO into
+ * injecting inefficient single-folio I/O into
* flusher writeback as much as possible: only
- * write pages when we've encountered many
- * dirty pages, and when we've already scanned
- * the rest of the LRU for clean pages and see
- * the same dirty pages again (PageReclaim).
+ * write folios when we've encountered many
+ * dirty folios, and when we've already scanned
+ * the rest of the LRU for clean folios and see
+ * the same dirty folios again (with the reclaim
+ * flag set).
*/
- if (page_is_file_lru(page) &&
- (!current_is_kswapd() || !PageReclaim(page) ||
+ if (folio_is_file_lru(folio) &&
+ (!current_is_kswapd() ||
+ !folio_test_reclaim(folio) ||
!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
* Immediately reclaim when written back.
- * Similar in principal to deactivate_page()
- * except we already have the page isolated
+ * Similar in principle to deactivate_page()
+ * except we already have the folio isolated
* and know it's dirty
*/
- inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
- SetPageReclaim(page);
+ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
+ nr_pages);
+ folio_set_reclaim(folio);
goto activate_locked;
}
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
- if (!may_enter_fs)
+ if (!may_enter_fs(folio, sc->gfp_mask))
goto keep_locked;
if (!sc->may_writepage)
goto keep_locked;
/*
- * Page is dirty. Flush the TLB if a writable entry
- * potentially exists to avoid CPU writes after IO
+ * Folio is dirty. Flush the TLB if a writable entry
+ * potentially exists to avoid CPU writes after I/O
* starts and then write it out here.
*/
try_to_unmap_flush_dirty();
- switch (pageout(folio, mapping)) {
+ switch (pageout(folio, mapping, &plug)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
@@ -1810,91 +1830,94 @@ retry:
case PAGE_SUCCESS:
stat->nr_pageout += nr_pages;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
goto keep;
- if (PageDirty(page))
+ if (folio_test_dirty(folio))
goto keep;
/*
* A synchronous write - probably a ramdisk. Go
- * ahead and try to reclaim the page.
+ * ahead and try to reclaim the folio.
*/
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto keep;
- if (PageDirty(page) || PageWriteback(page))
+ if (folio_test_dirty(folio) ||
+ folio_test_writeback(folio))
goto keep_locked;
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
fallthrough;
case PAGE_CLEAN:
- ; /* try to free the page below */
+ ; /* try to free the folio below */
}
}
/*
- * If the page has buffers, try to free the buffer mappings
- * associated with this page. If we succeed we try to free
- * the page as well.
+ * If the folio has buffers, try to free the buffer
+ * mappings associated with this folio. If we succeed
+ * we try to free the folio as well.
*
- * We do this even if the page is PageDirty().
- * try_to_release_page() does not perform I/O, but it is
- * possible for a page to have PageDirty set, but it is actually
- * clean (all its buffers are clean). This happens if the
- * buffers were written out directly, with submit_bh(). ext3
- * will do this, as well as the blockdev mapping.
- * try_to_release_page() will discover that cleanness and will
- * drop the buffers and mark the page clean - it can be freed.
+ * We do this even if the folio is dirty.
+ * filemap_release_folio() does not perform I/O, but it
+ * is possible for a folio to have the dirty flag set,
+ * but it is actually clean (all its buffers are clean).
+ * This happens if the buffers were written out directly,
+ * with submit_bh(). ext3 will do this, as well as
+ * the blockdev mapping. filemap_release_folio() will
+ * discover that cleanness and will drop the buffers
+ * and mark the folio clean - it can be freed.
*
- * Rarely, pages can have buffers and no ->mapping. These are
- * the pages which were not successfully invalidated in
- * truncate_cleanup_page(). We try to drop those buffers here
- * and if that worked, and the page is no longer mapped into
- * process address space (page_count == 1) it can be freed.
- * Otherwise, leave the page on the LRU so it is swappable.
+ * Rarely, folios can have buffers and no ->mapping.
+ * These are the folios which were not successfully
+ * invalidated in truncate_cleanup_folio(). We try to
+ * drop those buffers here and if that worked, and the
+ * folio is no longer mapped into process address space
+ * (refcount == 1) it can be freed. Otherwise, leave
+ * the folio on the LRU so it is swappable.
*/
- if (page_has_private(page)) {
- if (!try_to_release_page(page, sc->gfp_mask))
+ if (folio_has_private(folio)) {
+ if (!filemap_release_folio(folio, sc->gfp_mask))
goto activate_locked;
- if (!mapping && page_count(page) == 1) {
- unlock_page(page);
- if (put_page_testzero(page))
+ if (!mapping && folio_ref_count(folio) == 1) {
+ folio_unlock(folio);
+ if (folio_put_testzero(folio))
goto free_it;
else {
/*
* rare race with speculative reference.
* the speculative reference will free
- * this page shortly, so we may
+ * this folio shortly, so we may
* increment nr_reclaimed here (and
* leave it off the LRU).
*/
- nr_reclaimed++;
+ nr_reclaimed += nr_pages;
continue;
}
}
}
- if (PageAnon(page) && !PageSwapBacked(page)) {
+ if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
/* follow __remove_mapping for reference */
- if (!page_ref_freeze(page, 1))
+ if (!folio_ref_freeze(folio, 1))
goto keep_locked;
/*
- * The page has only one reference left, which is
+ * The folio has only one reference left, which is
* from the isolation. After the caller puts the
- * page back on lru and drops the reference, the
- * page will be freed anyway. It doesn't matter
- * which lru it goes. So we don't bother checking
- * PageDirty here.
+ * folio back on the lru and drops the reference, the
+ * folio will be freed anyway. It doesn't matter
+ * which lru it goes on. So we don't bother checking
+ * the dirty flag here.
*/
- count_vm_event(PGLAZYFREED);
- count_memcg_page_event(page, PGLAZYFREED);
+ count_vm_events(PGLAZYFREED, nr_pages);
+ count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
} else if (!mapping || !__remove_mapping(mapping, folio, true,
sc->target_mem_cgroup))
goto keep_locked;
- unlock_page(page);
+ folio_unlock(folio);
free_it:
/*
- * THP may get swapped out in a whole, need account
- * all base pages.
+ * Folio may get swapped out as a whole, need to account
+ * all pages in it.
*/
nr_reclaimed += nr_pages;
@@ -1902,10 +1925,10 @@ free_it:
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
*/
- if (unlikely(PageTransHuge(page)))
- destroy_compound_page(page);
+ if (unlikely(folio_test_large(folio)))
+ destroy_compound_page(&folio->page);
else
- list_add(&page->lru, &free_pages);
+ list_add(&folio->lru, &free_pages);
continue;
activate_locked_split:
@@ -1919,29 +1942,31 @@ activate_locked_split:
}
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
- if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
- PageMlocked(page)))
- try_to_free_swap(page);
- VM_BUG_ON_PAGE(PageActive(page), page);
- if (!PageMlocked(page)) {
- int type = page_is_file_lru(page);
- SetPageActive(page);
+ if (folio_test_swapcache(folio) &&
+ (mem_cgroup_swap_full(&folio->page) ||
+ folio_test_mlocked(folio)))
+ try_to_free_swap(&folio->page);
+ VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
+ if (!folio_test_mlocked(folio)) {
+ int type = folio_is_file_lru(folio);
+ folio_set_active(folio);
stat->nr_activate[type] += nr_pages;
- count_memcg_page_event(page, PGACTIVATE);
+ count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
}
keep_locked:
- unlock_page(page);
+ folio_unlock(folio);
keep:
- list_add(&page->lru, &ret_pages);
- VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
+ list_add(&folio->lru, &ret_pages);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
+ folio_test_unevictable(folio), folio);
}
/* 'page_list' is always empty here */
- /* Migrate pages selected for demotion */
+ /* Migrate folios selected for demotion */
nr_reclaimed += demote_page_list(&demote_pages, pgdat);
- /* Pages that could not be demoted are still in @demote_pages */
+ /* Folios that could not be demoted are still in @demote_pages */
if (!list_empty(&demote_pages)) {
- /* Pages which failed to demoted go back on @page_list for retry: */
+ /* Folios which weren't demoted go back on @page_list for retry: */
list_splice_init(&demote_pages, page_list);
do_demote_pass = false;
goto retry;
@@ -1956,6 +1981,8 @@ keep:
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
+ if (plug)
+ swap_write_unplug(plug);
return nr_reclaimed;
}
@@ -2117,8 +2144,8 @@ move:
* Splice any skipped pages to the start of the LRU list. Note that
* this disrupts the LRU order when reclaiming for lower zones but
* we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
- * scanning would soon rescan the same pages to skip and put the
- * system at risk of premature OOM.
+ * scanning would soon rescan the same pages to skip and waste lots
+ * of cpu cycles.
*/
if (!list_empty(&pages_skipped)) {
int zid;
@@ -2297,10 +2324,9 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
}
/*
- * If a kernel thread (such as nfsd for loop-back mounts) services
- * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
- * In that case we should only throttle if the backing device it is
- * writing to is congested. In other cases it is safe to throttle.
+ * If a kernel thread (such as nfsd for loop-back mounts) services a backing
+ * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case
+ * we should not throttle. Otherwise it is safe to do so.
*/
static int current_may_throttle(void)
{
@@ -2472,8 +2498,9 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
}
+ /* Referenced or rmap lock contention: rotate */
if (folio_referenced(folio, 0, sc->target_mem_cgroup,
- &vm_flags)) {
+ &vm_flags) != 0) {
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
@@ -2517,14 +2544,12 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate, nr_rotated, sc->priority, file);
}
-unsigned long reclaim_pages(struct list_head *page_list)
+static unsigned int reclaim_page_list(struct list_head *page_list,
+ struct pglist_data *pgdat)
{
- int nid = NUMA_NO_NODE;
- unsigned int nr_reclaimed = 0;
- LIST_HEAD(node_page_list);
struct reclaim_stat dummy_stat;
- struct page *page;
- unsigned int noreclaim_flag;
+ unsigned int nr_reclaimed;
+ struct folio *folio;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_writepage = 1,
@@ -2533,14 +2558,32 @@ unsigned long reclaim_pages(struct list_head *page_list)
.no_demotion = 1,
};
+ nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false);
+ while (!list_empty(page_list)) {
+ folio = lru_to_folio(page_list);
+ list_del(&folio->lru);
+ folio_putback_lru(folio);
+ }
+
+ return nr_reclaimed;
+}
+
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+ int nid;
+ unsigned int nr_reclaimed = 0;
+ LIST_HEAD(node_page_list);
+ struct page *page;
+ unsigned int noreclaim_flag;
+
+ if (list_empty(page_list))
+ return nr_reclaimed;
+
noreclaim_flag = memalloc_noreclaim_save();
- while (!list_empty(page_list)) {
+ nid = page_to_nid(lru_to_page(page_list));
+ do {
page = lru_to_page(page_list);
- if (nid == NUMA_NO_NODE) {
- nid = page_to_nid(page);
- INIT_LIST_HEAD(&node_page_list);
- }
if (nid == page_to_nid(page)) {
ClearPageActive(page);
@@ -2548,28 +2591,11 @@ unsigned long reclaim_pages(struct list_head *page_list)
continue;
}
- nr_reclaimed += shrink_page_list(&node_page_list,
- NODE_DATA(nid),
- &sc, &dummy_stat, false);
- while (!list_empty(&node_page_list)) {
- page = lru_to_page(&node_page_list);
- list_del(&page->lru);
- putback_lru_page(page);
- }
-
- nid = NUMA_NO_NODE;
- }
+ nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid));
+ nid = page_to_nid(lru_to_page(page_list));
+ } while (!list_empty(page_list));
- if (!list_empty(&node_page_list)) {
- nr_reclaimed += shrink_page_list(&node_page_list,
- NODE_DATA(nid),
- &sc, &dummy_stat, false);
- while (!list_empty(&node_page_list)) {
- page = lru_to_page(&node_page_list);
- list_del(&page->lru);
- putback_lru_page(page);
- }
- }
+ nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid));
memalloc_noreclaim_restore(noreclaim_flag);
@@ -2646,9 +2672,7 @@ enum scan_balance {
/*
* Determine how aggressively the anon and file LRU lists should be
- * scanned. The relative value of each set of LRU lists is determined
- * by looking at the fraction of the pages scanned we did rotate back
- * onto the active list instead of evict.
+ * scanned.
*
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
@@ -3912,7 +3936,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
}
/*
- * If a node has no populated zone within highest_zoneidx, it does not
+ * If a node has no managed zone within highest_zoneidx, it does not
* need balancing by definition. This can happen if a zone-restricted
* allocation tries to wake a remote kswapd.
*/
@@ -4552,7 +4576,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
/*
* This kswapd start function will be called by init and node-hot-add.
- * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
void kswapd_run(int nid)
{
@@ -4699,7 +4722,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
noreclaim_flag = memalloc_noreclaim_save();
set_task_reclaim_state(p, &sc.reclaim_state);
- if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
+ if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
+ node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
/*
* Free memory by calling shrink node with increasing
* priorities until we have enough memory freed.