summaryrefslogtreecommitdiff
path: root/mm/vmscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c540
1 files changed, 280 insertions, 260 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 76378bc257e3..c767d71c43d7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -374,7 +374,14 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
-
+ /*
+ * If there are no reclaimable file-backed or anonymous pages,
+ * ensure zones with sufficient free pages are not skipped.
+ * This prevents zones like DMA32 from being ignored in reclaim
+ * scenarios where they can still help alleviate memory pressure.
+ */
+ if (nr == 0)
+ nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
return nr;
}
@@ -855,6 +862,31 @@ enum folio_references {
FOLIOREF_ACTIVATE,
};
+#ifdef CONFIG_LRU_GEN
+/*
+ * Only used on a mapped folio in the eviction (rmap walk) path, where promotion
+ * needs to be done by taking the folio off the LRU list and then adding it back
+ * with PG_active set. In contrast, the aging (page table walk) path uses
+ * folio_update_gen().
+ */
+static bool lru_gen_set_refs(struct folio *folio)
+{
+ /* see the comment on LRU_REFS_FLAGS */
+ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ return false;
+ }
+
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset));
+ return true;
+}
+#else
+static bool lru_gen_set_refs(struct folio *folio)
+{
+ return false;
+}
+#endif /* CONFIG_LRU_GEN */
+
static enum folio_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
@@ -863,7 +895,6 @@ static enum folio_references folio_check_references(struct folio *folio,
referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
&vm_flags);
- referenced_folio = folio_test_clear_referenced(folio);
/*
* The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
@@ -881,6 +912,15 @@ static enum folio_references folio_check_references(struct folio *folio,
if (referenced_ptes == -1)
return FOLIOREF_KEEP;
+ if (lru_gen_enabled()) {
+ if (!referenced_ptes)
+ return FOLIOREF_RECLAIM;
+
+ return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP;
+ }
+
+ referenced_folio = folio_test_clear_referenced(folio);
+
if (referenced_ptes) {
/*
* All mapped folios start out with page table
@@ -1046,7 +1086,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
LIST_HEAD(demote_folios);
- unsigned int nr_reclaimed = 0;
+ unsigned int nr_reclaimed = 0, nr_demoted = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
struct swap_iocb *plug = NULL;
@@ -1085,11 +1125,6 @@ retry:
if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
- /* folio_update_gen() tried to promote this page? */
- if (lru_gen_enabled() && !ignore_references &&
- folio_mapped(folio) && folio_test_referenced(folio))
- goto keep_locked;
-
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -1515,8 +1550,9 @@ keep:
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
- nr_reclaimed += stat->nr_demoted;
+ nr_demoted = demote_folio_list(&demote_folios, pgdat);
+ nr_reclaimed += nr_demoted;
+ stat->nr_demoted += nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */
if (!list_empty(&demote_folios)) {
/* Folios which weren't demoted go back on @folio_list */
@@ -1657,6 +1693,7 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
+ unsigned long max_nr_skipped = 0;
LIST_HEAD(folios_skipped);
total_scan = 0;
@@ -1671,9 +1708,12 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (folio_zonenum(folio) > sc->reclaim_idx) {
+ /* Using max_nr_skipped to prevent hard LOCKUP*/
+ if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED &&
+ (folio_zonenum(folio) > sc->reclaim_idx)) {
nr_skipped[folio_zonenum(folio)] += nr_pages;
move_to = &folios_skipped;
+ max_nr_skipped++;
goto move;
}
@@ -2612,11 +2652,17 @@ static bool should_clear_pmd_young(void)
READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
}
+#define evictable_min_seq(min_seq, swappiness) \
+ min((min_seq)[!(swappiness)], (min_seq)[(swappiness) <= MAX_SWAPPINESS])
+
#define for_each_gen_type_zone(gen, type, zone) \
for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+#define for_each_evictable_type(type, swappiness) \
+ for ((type) = !(swappiness); (type) <= ((swappiness) <= MAX_SWAPPINESS); (type)++)
+
#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
@@ -2662,10 +2708,16 @@ static int get_nr_gens(struct lruvec *lruvec, int type)
static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
{
- /* see the comment on lru_gen_folio */
- return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
- get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
- get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
+ int type;
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ int n = get_nr_gens(lruvec, type);
+
+ if (n < MIN_NR_GENS || n > MAX_NR_GENS)
+ return false;
+ }
+
+ return true;
}
/******************************************************************************
@@ -3066,16 +3118,20 @@ struct ctrl_pos {
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct ctrl_pos *pos)
{
+ int i;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
- pos->refaulted = lrugen->avg_refaulted[type][tier] +
- atomic_long_read(&lrugen->refaulted[hist][type][tier]);
- pos->total = lrugen->avg_total[type][tier] +
- atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- pos->total += lrugen->protected[hist][type][tier - 1];
pos->gain = gain;
+ pos->refaulted = pos->total = 0;
+
+ for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) {
+ pos->refaulted += lrugen->avg_refaulted[type][i] +
+ atomic_long_read(&lrugen->refaulted[hist][type][i]);
+ pos->total += lrugen->avg_total[type][i] +
+ lrugen->protected[hist][type][i] +
+ atomic_long_read(&lrugen->evicted[hist][type][i]);
+ }
}
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
@@ -3101,17 +3157,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
sum = lrugen->avg_total[type][tier] +
+ lrugen->protected[hist][type][tier] +
atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- sum += lrugen->protected[hist][type][tier - 1];
WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
}
if (clear) {
atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
- if (tier)
- WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
+ WRITE_ONCE(lrugen->protected[hist][type][tier], 0);
}
}
}
@@ -3138,16 +3192,19 @@ static int folio_update_gen(struct folio *folio, int gen)
VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ /* see the comment on LRU_REFS_FLAGS */
+ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
+ return -1;
+ }
+
do {
/* lru_gen_del_folio() has isolated this page? */
- if (!(old_flags & LRU_GEN_MASK)) {
- /* for shrink_folio_list() */
- new_flags = old_flags | BIT(PG_referenced);
- continue;
- }
+ if (!(old_flags & LRU_GEN_MASK))
+ return -1;
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
- new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
+ new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset);
} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
@@ -3171,7 +3228,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
new_gen = (old_gen + 1) % MAX_NR_GENS;
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
/* for folio_end_writeback() */
if (reclaiming)
@@ -3246,7 +3303,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
return true;
if (vma_is_anonymous(vma))
- return !walk->can_swap;
+ return !walk->swappiness;
if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
return true;
@@ -3256,7 +3313,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
return true;
if (shmem_mapping(mapping))
- return !walk->can_swap;
+ return !walk->swappiness;
+
+ if (walk->swappiness > MAX_SWAPPINESS)
+ return true;
/* to exclude special mappings like dax, etc. */
return !mapping->a_ops->read_folio;
@@ -3344,19 +3404,17 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
}
static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
- struct pglist_data *pgdat, bool can_swap)
+ struct pglist_data *pgdat)
{
- struct folio *folio;
+ struct folio *folio = pfn_folio(pfn);
- folio = pfn_folio(pfn);
- if (folio_nid(folio) != pgdat->node_id)
+ if (folio_lru_gen(folio) < 0)
return NULL;
- if (folio_memcg(folio) != memcg)
+ if (folio_nid(folio) != pgdat->node_id)
return NULL;
- /* file VMAs can contain anon pages from COW */
- if (!folio_is_file_lru(folio) && !can_swap)
+ if (folio_memcg(folio) != memcg)
return NULL;
return folio;
@@ -3370,29 +3428,55 @@ static bool suitable_to_scan(int total, int young)
return young * n >= total;
}
+static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio,
+ int new_gen, bool dirty)
+{
+ int old_gen;
+
+ if (!folio)
+ return;
+
+ if (dirty && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ if (walk) {
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ } else if (lru_gen_set_refs(folio)) {
+ old_gen = folio_lru_gen(folio);
+ if (old_gen >= 0 && old_gen != new_gen)
+ folio_activate(folio);
+ }
+}
+
static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct mm_walk *args)
{
int i;
+ bool dirty;
pte_t *pte;
spinlock_t *ptl;
unsigned long addr;
int total = 0;
int young = 0;
+ struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
pmd_t pmdval;
- pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval,
- &ptl);
+ pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
if (!pte)
return false;
+
if (!spin_trylock(ptl)) {
pte_unmap(pte);
- return false;
+ return true;
}
if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
@@ -3414,26 +3498,30 @@ restart:
if (pfn == -1)
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
continue;
if (!ptep_clear_young_notify(args->vma, addr, pte + i))
continue;
- young++;
- walk->mm_stats[MM_LEAF_YOUNG]++;
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ last = folio;
+ dirty = false;
+ }
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ if (pte_dirty(ptent))
+ dirty = true;
+
+ young++;
+ walk->mm_stats[MM_LEAF_YOUNG]++;
}
+ walk_update_folio(walk, last, gen, dirty);
+ last = NULL;
+
if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
goto restart;
@@ -3447,13 +3535,15 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
{
int i;
+ bool dirty;
pmd_t *pmd;
spinlock_t *ptl;
+ struct folio *last = NULL;
struct lru_gen_mm_walk *walk = args->private;
struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
DEFINE_MAX_SEQ(walk->lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -3499,27 +3589,30 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
if (pfn == -1)
goto next;
- folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
goto next;
if (!pmdp_clear_young_notify(vma, addr, pmd + i))
goto next;
- walk->mm_stats[MM_LEAF_YOUNG]++;
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
+ last = folio;
+ dirty = false;
+ }
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ if (pmd_dirty(pmd[i]))
+ dirty = true;
+
+ walk->mm_stats[MM_LEAF_YOUNG]++;
next:
i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
} while (i <= MIN_LRU_BATCH);
+ walk_update_folio(walk, last, gen, dirty);
+
arch_leave_lazy_mmu_mode();
spin_unlock(ptl);
done:
@@ -3711,22 +3804,25 @@ static void clear_mm_walk(void)
kfree(walk);
}
-static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
+static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
{
int zone;
int remaining = MAX_LRU_BATCH;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- if (type == LRU_GEN_ANON && !can_swap)
+ if (type ? swappiness > MAX_SWAPPINESS : !swappiness)
goto done;
- /* prevent cold/hot inversion if force_scan is true */
+ /* prevent cold/hot inversion if the type is evictable */
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
struct list_head *head = &lrugen->folios[old_gen][type][zone];
while (!list_empty(head)) {
struct folio *folio = lru_to_folio(head);
+ int refs = folio_lru_refs(folio);
+ bool workingset = folio_test_workingset(folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
@@ -3736,6 +3832,15 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
new_gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
+ /* don't count the workingset being lazily promoted */
+ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+ int tier = lru_tier_from_refs(refs, workingset);
+ int delta = folio_nr_pages(folio);
+
+ WRITE_ONCE(lrugen->protected[hist][type][tier],
+ lrugen->protected[hist][type][tier] + delta);
+ }
+
if (!--remaining)
return false;
}
@@ -3747,7 +3852,7 @@ done:
return true;
}
-static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
+static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
{
int gen, type, zone;
bool success = false;
@@ -3757,7 +3862,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
/* find the oldest populated generation */
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
gen = lru_gen_from_seq(min_seq[type]);
@@ -3773,13 +3878,17 @@ next:
}
/* see the comment on lru_gen_folio */
- if (can_swap) {
- min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
- min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
+ if (swappiness && swappiness <= MAX_SWAPPINESS) {
+ unsigned long seq = lrugen->max_seq - MIN_NR_GENS;
+
+ if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq)
+ min_seq[LRU_GEN_ANON] = seq;
+ else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq)
+ min_seq[LRU_GEN_FILE] = seq;
}
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
- if (min_seq[type] == lrugen->min_seq[type])
+ for_each_evictable_type(type, swappiness) {
+ if (min_seq[type] <= lrugen->min_seq[type])
continue;
reset_ctrl_pos(lruvec, type, true);
@@ -3790,8 +3899,7 @@ next:
return success;
}
-static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness)
{
bool success;
int prev, next;
@@ -3809,13 +3917,11 @@ restart:
if (!success)
goto unlock;
- for (type = ANON_AND_FILE - 1; type >= 0; type--) {
+ for (type = 0; type < ANON_AND_FILE; type++) {
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;
- VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
-
- if (inc_min_seq(lruvec, type, can_swap))
+ if (inc_min_seq(lruvec, type, swappiness))
continue;
spin_unlock_irq(&lruvec->lru_lock);
@@ -3859,7 +3965,7 @@ unlock:
}
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+ int swappiness, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -3870,7 +3976,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
if (!mm_state)
- return inc_max_seq(lruvec, seq, can_swap, force_scan);
+ return inc_max_seq(lruvec, seq, swappiness);
/* see the comment in iterate_mm_list() */
if (seq <= READ_ONCE(mm_state->seq))
@@ -3895,7 +4001,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
walk->lruvec = lruvec;
walk->seq = seq;
- walk->can_swap = can_swap;
+ walk->swappiness = swappiness;
walk->force_scan = force_scan;
do {
@@ -3905,7 +4011,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
} while (mm);
done:
if (success) {
- success = inc_max_seq(lruvec, seq, can_swap, force_scan);
+ success = inc_max_seq(lruvec, seq, swappiness);
WARN_ON_ONCE(!success);
}
@@ -3946,13 +4052,13 @@ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
unsigned long total = 0;
- bool can_swap = get_swappiness(lruvec, sc);
+ int swappiness = get_swappiness(lruvec, sc);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
@@ -3972,6 +4078,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
{
int gen;
unsigned long birth;
+ int swappiness = get_swappiness(lruvec, sc);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);
@@ -3981,8 +4088,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
if (!lruvec_is_sizable(lruvec, sc))
return false;
- /* see the comment on lru_gen_folio */
- gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness));
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
return time_is_before_jiffies(birth + min_ttl);
@@ -4041,21 +4147,22 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
int i;
+ bool dirty;
unsigned long start;
unsigned long end;
struct lru_gen_mm_walk *walk;
+ struct folio *last = NULL;
int young = 1;
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
struct vm_area_struct *vma = pvmw->vma;
struct folio *folio = pfn_folio(pvmw->pfn);
- bool can_swap = !folio_is_file_lru(folio);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
DEFINE_MAX_SEQ(lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int gen = lru_gen_from_seq(max_seq);
lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
@@ -4102,37 +4209,28 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (pfn == -1)
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat);
if (!folio)
continue;
if (!ptep_clear_young_notify(vma, addr, pte + i))
continue;
- young++;
-
- if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
- !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio)))
- folio_mark_dirty(folio);
-
- if (walk) {
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
+ if (last != folio) {
+ walk_update_folio(walk, last, gen, dirty);
- continue;
+ last = folio;
+ dirty = false;
}
- old_gen = folio_lru_gen(folio);
- if (old_gen < 0)
- folio_set_referenced(folio);
- else if (old_gen != new_gen) {
- folio_clear_lru_refs(folio);
- folio_activate(folio);
- }
+ if (pte_dirty(ptent))
+ dirty = true;
+
+ young++;
}
+ walk_update_folio(walk, last, gen, dirty);
+
arch_leave_lazy_mmu_mode();
/* feedback from rmap walkers to page table walkers */
@@ -4290,7 +4388,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
int zone = folio_zonenum(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
- int tier = lru_tier_from_refs(refs);
+ bool workingset = folio_test_workingset(folio);
+ int tier = lru_tier_from_refs(refs, workingset);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
@@ -4312,14 +4411,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* protected */
- if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
- int hist = lru_hist_from_seq(lrugen->min_seq[type]);
-
+ if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) {
gen = folio_inc_gen(lruvec, folio, false);
- list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
+
+ /* don't count the workingset being lazily promoted */
+ if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
- WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
- lrugen->protected[hist][type][tier - 1] + delta);
+ WRITE_ONCE(lrugen->protected[hist][type][tier],
+ lrugen->protected[hist][type][tier] + delta);
+ }
return true;
}
@@ -4339,8 +4441,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* waiting for writeback */
- if (folio_test_locked(folio) || writeback ||
- (type == LRU_GEN_FILE && dirty)) {
+ if (writeback || (type == LRU_GEN_FILE && dirty)) {
gen = folio_inc_gen(lruvec, folio, true);
list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
@@ -4369,13 +4470,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
return false;
}
- /* see the comment on MAX_NR_TIERS */
+ /* see the comment on LRU_REFS_FLAGS */
if (!folio_test_referenced(folio))
- folio_clear_lru_refs(folio);
+ set_mask_bits(&folio->flags, LRU_REFS_MASK, 0);
/* for shrink_folio_list() */
folio_clear_reclaim(folio);
- folio_clear_referenced(folio);
success = lru_gen_del_folio(lruvec, folio, true);
VM_WARN_ON_ONCE_FOLIO(!success, folio);
@@ -4471,13 +4571,13 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
struct ctrl_pos sp, pv;
/*
- * To leave a margin for fluctuations, use a larger gain factor (1:2).
+ * To leave a margin for fluctuations, use a larger gain factor (2:3).
* This value is chosen because any other tier would have at least twice
* as many refaults as the first tier.
*/
- read_ctrl_pos(lruvec, type, 0, 1, &sp);
+ read_ctrl_pos(lruvec, type, 0, 2, &sp);
for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- read_ctrl_pos(lruvec, type, tier, 2, &pv);
+ read_ctrl_pos(lruvec, type, tier, 3, &pv);
if (!positive_ctrl_err(&sp, &pv))
break;
}
@@ -4485,79 +4585,45 @@ static int get_tier_idx(struct lruvec *lruvec, int type)
return tier - 1;
}
-static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
{
- int type, tier;
struct ctrl_pos sp, pv;
- int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness };
+ if (swappiness <= MIN_SWAPPINESS + 1)
+ return LRU_GEN_FILE;
+
+ if (swappiness >= MAX_SWAPPINESS)
+ return LRU_GEN_ANON;
/*
- * Compare the first tier of anon with that of file to determine which
- * type to scan. Also need to compare other tiers of the selected type
- * with the first tier of the other type to determine the last tier (of
- * the selected type) to evict.
+ * Compare the sum of all tiers of anon with that of file to determine
+ * which type to scan.
*/
- read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
- read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
- type = positive_ctrl_err(&sp, &pv);
+ read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp);
+ read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv);
- read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
- for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
- if (!positive_ctrl_err(&sp, &pv))
- break;
- }
-
- *tier_idx = tier - 1;
-
- return type;
+ return positive_ctrl_err(&sp, &pv);
}
static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
int *type_scanned, struct list_head *list)
{
int i;
- int type;
- int scanned;
- int tier = -1;
- DEFINE_MIN_SEQ(lruvec);
+ int type = get_type_to_scan(lruvec, swappiness);
- /*
- * Try to make the obvious choice first, and if anon and file are both
- * available from the same generation,
- * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon
- * first.
- * 2. If !__GFP_IO, file first since clean pagecache is more likely to
- * exist than clean swapcache.
- */
- if (!swappiness)
- type = LRU_GEN_FILE;
- else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
- type = LRU_GEN_ANON;
- else if (swappiness == 1)
- type = LRU_GEN_FILE;
- else if (swappiness == MAX_SWAPPINESS)
- type = LRU_GEN_ANON;
- else if (!(sc->gfp_mask & __GFP_IO))
- type = LRU_GEN_FILE;
- else
- type = get_type_to_scan(lruvec, swappiness, &tier);
+ for_each_evictable_type(i, swappiness) {
+ int scanned;
+ int tier = get_tier_idx(lruvec, type);
- for (i = !swappiness; i < ANON_AND_FILE; i++) {
- if (tier < 0)
- tier = get_tier_idx(lruvec, type);
+ *type_scanned = type;
scanned = scan_folios(lruvec, sc, type, tier, list);
if (scanned)
- break;
+ return scanned;
type = !type;
- tier = -1;
}
- *type_scanned = type;
-
- return scanned;
+ return 0;
}
static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
@@ -4573,6 +4639,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
struct reclaim_stat stat;
struct lru_gen_mm_walk *walk;
bool skip_retry = false;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -4582,7 +4649,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
scanned += try_to_inc_min_seq(lruvec, swappiness);
- if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
+ if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq)
scanned = 0;
spin_unlock_irq(&lruvec->lru_lock);
@@ -4598,31 +4665,24 @@ retry:
type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+ DEFINE_MIN_SEQ(lruvec);
+
if (!folio_evictable(folio)) {
list_del(&folio->lru);
folio_putback_lru(folio);
continue;
}
- if (folio_test_reclaim(folio) &&
- (folio_test_dirty(folio) || folio_test_writeback(folio))) {
- /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
- if (folio_test_workingset(folio))
- folio_set_referenced(folio);
- continue;
- }
-
- if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
- folio_mapped(folio) || folio_test_locked(folio) ||
- folio_test_dirty(folio) || folio_test_writeback(folio)) {
- /* don't add rejected folios to the oldest generation */
- set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
- BIT(PG_active));
+ /* retry folios that may have missed folio_rotate_reclaimable() */
+ if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
+ !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
+ list_move(&folio->lru, &clean);
continue;
}
- /* retry folios that may have missed folio_rotate_reclaimable() */
- list_move(&folio->lru, &clean);
+ /* don't add rejected folios to the oldest generation */
+ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
+ set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
}
spin_lock_irq(&lruvec->lru_lock);
@@ -4635,6 +4695,9 @@ retry:
reset_batch_size(walk);
}
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ stat.nr_demoted);
+
item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
@@ -4654,63 +4717,32 @@ retry:
}
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
- bool can_swap, unsigned long *nr_to_scan)
+ int swappiness, unsigned long *nr_to_scan)
{
int gen, type, zone;
- unsigned long old = 0;
- unsigned long young = 0;
- unsigned long total = 0;
+ unsigned long size = 0;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
- /* whether this lruvec is completely out of cold folios */
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
- *nr_to_scan = 0;
+ *nr_to_scan = 0;
+ /* have to run aging, since eviction is not possible anymore */
+ if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq)
return true;
- }
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ for_each_evictable_type(type, swappiness) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
- unsigned long size = 0;
-
gen = lru_gen_from_seq(seq);
for (zone = 0; zone < MAX_NR_ZONES; zone++)
size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
-
- total += size;
- if (seq == max_seq)
- young += size;
- else if (seq + MIN_NR_GENS == max_seq)
- old += size;
}
}
- *nr_to_scan = total;
-
- /*
- * The aging tries to be lazy to reduce the overhead, while the eviction
- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
- * ideal number of generations is MIN_NR_GENS+1.
- */
- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
- return false;
-
- /*
- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
- * of the total number of pages for each generation. A reasonable range
- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
- * aging cares about the upper bound of hot pages, while the eviction
- * cares about the lower bound of cold pages.
- */
- if (young * MIN_NR_GENS > total)
- return true;
- if (old * (MIN_NR_GENS + 2) < total)
- return true;
-
- return false;
+ *nr_to_scan = size;
+ /* better to run aging even though eviction is still possible */
+ return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq;
}
/*
@@ -4718,7 +4750,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
bool success;
unsigned long nr_to_scan;
@@ -4728,7 +4760,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
return -1;
- success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan);
+ success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan);
/* try to scrape all its memory if this memcg was deleted */
if (nr_to_scan && !mem_cgroup_online(memcg))
@@ -4739,7 +4771,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
return nr_to_scan >> sc->priority;
/* stop scanning this lruvec as it's low on cold folios */
- return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0;
+ return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0;
}
static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
@@ -5283,8 +5315,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
s = "rep";
n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
- if (tier)
- n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier]);
}
for (i = 0; i < 3; i++)
@@ -5339,7 +5370,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
seq_printf(m, " node %5d\n", nid);
if (!full)
- seq = min_seq[LRU_GEN_ANON];
+ seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2);
else if (max_seq >= MAX_NR_GENS)
seq = max_seq - MAX_NR_GENS + 1;
else
@@ -5379,23 +5410,14 @@ static const struct seq_operations lru_gen_seq_ops = {
};
static int run_aging(struct lruvec *lruvec, unsigned long seq,
- bool can_swap, bool force_scan)
+ int swappiness, bool force_scan)
{
DEFINE_MAX_SEQ(lruvec);
- DEFINE_MIN_SEQ(lruvec);
-
- if (seq < max_seq)
- return 0;
if (seq > max_seq)
return -EINVAL;
- if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
- return -ERANGE;
-
- try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
-
- return 0;
+ return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST;
}
static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
@@ -5411,7 +5433,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
while (!signal_pending(current)) {
DEFINE_MIN_SEQ(lruvec);
- if (seq < min_seq[!swappiness])
+ if (seq < evictable_min_seq(min_seq, swappiness))
return 0;
if (sc->nr_reclaimed >= nr_to_reclaim)
@@ -5456,7 +5478,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (swappiness < MIN_SWAPPINESS)
swappiness = get_swappiness(lruvec, sc);
- else if (swappiness > MAX_SWAPPINESS)
+ else if (swappiness > MAX_SWAPPINESS + 1)
goto done;
switch (cmd) {
@@ -7182,10 +7204,6 @@ static int kswapd(void *p)
unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t *)p;
struct task_struct *tsk = current;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-
- if (!cpumask_empty(cpumask))
- set_cpus_allowed_ptr(tsk, cpumask);
/*
* Tell the memory management that we're a "memory allocator",
@@ -7354,13 +7372,15 @@ void __meminit kswapd_run(int nid)
pgdat_kswapd_lock(pgdat);
if (!pgdat->kswapd) {
- pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
pr_err("Failed to start kswapd on node %d,ret=%ld\n",
nid, PTR_ERR(pgdat->kswapd));
BUG_ON(system_state < SYSTEM_RUNNING);
pgdat->kswapd = NULL;
+ } else {
+ wake_up_process(pgdat->kswapd);
}
}
pgdat_kswapd_unlock(pgdat);