diff options
author | Mel Gorman <mgorman@techsingularity.net> | 2016-07-28 15:45:31 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-07-28 16:07:41 -0700 |
commit | 599d0c954f91d0689c9bb421b5bc04ea02437a41 (patch) | |
tree | e863ce685841e494bcb63e458739e0939ac684f6 /mm/swap.c | |
parent | a52633d8e9c35832f1409dc5fa166019048a3f1f (diff) | |
download | lwn-599d0c954f91d0689c9bb421b5bc04ea02437a41.tar.gz lwn-599d0c954f91d0689c9bb421b5bc04ea02437a41.zip |
mm, vmscan: move LRU lists to node
This moves the LRU lists from the zone to the node and related data such
as counters, tracing, congestion tracking and writeback tracking.
Unfortunately, due to reclaim and compaction retry logic, it is
necessary to account for the number of LRU pages on both zone and node
logic. Most reclaim logic is based on the node counters but the retry
logic uses the zone counters which do not distinguish inactive and
active sizes. It would be possible to leave the LRU counters on a
per-zone basis but it's a heavier calculation across multiple cache
lines that is much more frequent than the retry checks.
Other than the LRU counters, this is mostly a mechanical patch but note
that it introduces a number of anomalies. For example, the scans are
per-zone but using per-node counters. We also mark a node as congested
when a zone is congested. This causes weird problems that are fixed
later but is easier to review.
In the event that there is excessive overhead on 32-bit systems due to
the nodes being on LRU then there are two potential solutions
1. Long-term isolation of highmem pages when reclaim is lowmem
When pages are skipped, they are immediately added back onto the LRU
list. If lowmem reclaim persisted for long periods of time, the same
highmem pages get continually scanned. The idea would be that lowmem
keeps those pages on a separate list until a reclaim for highmem pages
arrives that splices the highmem pages back onto the LRU. It potentially
could be implemented similar to the UNEVICTABLE list.
That would reduce the skip rate with the potential corner case is that
highmem pages have to be scanned and reclaimed to free lowmem slab pages.
2. Linear scan lowmem pages if the initial LRU shrink fails
This will break LRU ordering but may be preferable and faster during
memory pressure than skipping LRU pages.
Link: http://lkml.kernel.org/r/1467970510-21195-4-git-send-email-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swap.c')
-rw-r--r-- | mm/swap.c | 50 |
1 files changed, 25 insertions, 25 deletions
diff --git a/mm/swap.c b/mm/swap.c index bf37e5cfae81..77af473635fe 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -63,7 +63,7 @@ static void __page_cache_release(struct page *page) unsigned long flags; spin_lock_irqsave(zone_lru_lock(zone), flags); - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); @@ -194,7 +194,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, spin_lock_irqsave(zone_lru_lock(zone), flags); } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); (*move_fn)(page, lruvec, arg); } if (zone) @@ -319,7 +319,7 @@ void activate_page(struct page *page) page = compound_head(page); spin_lock_irq(zone_lru_lock(zone)); - __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); + __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); spin_unlock_irq(zone_lru_lock(zone)); } #endif @@ -445,16 +445,16 @@ void lru_cache_add(struct page *page) */ void add_page_to_unevictable_list(struct page *page) { - struct zone *zone = page_zone(page); + struct pglist_data *pgdat = page_pgdat(page); struct lruvec *lruvec; - spin_lock_irq(zone_lru_lock(zone)); - lruvec = mem_cgroup_page_lruvec(page, zone); + spin_lock_irq(&pgdat->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, pgdat); ClearPageActive(page); SetPageUnevictable(page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); - spin_unlock_irq(zone_lru_lock(zone)); + spin_unlock_irq(&pgdat->lru_lock); } /** @@ -730,7 +730,7 @@ void release_pages(struct page **pages, int nr, bool cold) { int i; LIST_HEAD(pages_to_free); - struct zone *zone = NULL; + struct pglist_data *locked_pgdat = NULL; struct lruvec *lruvec; unsigned long uninitialized_var(flags); unsigned int uninitialized_var(lock_batch); @@ -741,11 +741,11 @@ void release_pages(struct page **pages, int nr, bool cold) /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the - * same zone. The lock is held only if zone != NULL. + * same pgdat. The lock is held only if pgdat != NULL. */ - if (zone && ++lock_batch == SWAP_CLUSTER_MAX) { - spin_unlock_irqrestore(zone_lru_lock(zone), flags); - zone = NULL; + if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + locked_pgdat = NULL; } if (is_huge_zero_page(page)) { @@ -758,27 +758,27 @@ void release_pages(struct page **pages, int nr, bool cold) continue; if (PageCompound(page)) { - if (zone) { - spin_unlock_irqrestore(zone_lru_lock(zone), flags); - zone = NULL; + if (locked_pgdat) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + locked_pgdat = NULL; } __put_compound_page(page); continue; } if (PageLRU(page)) { - struct zone *pagezone = page_zone(page); + struct pglist_data *pgdat = page_pgdat(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irqrestore(zone_lru_lock(zone), + if (pgdat != locked_pgdat) { + if (locked_pgdat) + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); lock_batch = 0; - zone = pagezone; - spin_lock_irqsave(zone_lru_lock(zone), flags); + locked_pgdat = pgdat; + spin_lock_irqsave(&locked_pgdat->lru_lock, flags); } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, locked_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); @@ -789,8 +789,8 @@ void release_pages(struct page **pages, int nr, bool cold) list_add(&page->lru, &pages_to_free); } - if (zone) - spin_unlock_irqrestore(zone_lru_lock(zone), flags); + if (locked_pgdat) + spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); mem_cgroup_uncharge_list(&pages_to_free); free_hot_cold_page_list(&pages_to_free, cold); @@ -826,7 +826,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, VM_BUG_ON_PAGE(PageCompound(page_tail), page); VM_BUG_ON_PAGE(PageLRU(page_tail), page); VM_BUG_ON(NR_CPUS != 1 && - !spin_is_locked(zone_lru_lock(lruvec_zone(lruvec)))); + !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock)); if (!list) SetPageLRU(page_tail); |