summaryrefslogtreecommitdiff
path: root/mm/swap_state.c
diff options
context:
space:
mode:
authorHugh Dickins <hughd@google.com>2023-10-19 13:39:08 -0700
committerAndrew Morton <akpm@linux-foundation.org>2023-10-25 16:47:16 -0700
commitddc1a5cbc05dc62743a2f409b96faa5cf95ba064 (patch)
tree6d3b7083fd984acd1391818a32ef9a7dd46049e7 /mm/swap_state.c
parent23e4883248f0472d806c8b3422ba6257e67bf1a5 (diff)
downloadlwn-ddc1a5cbc05dc62743a2f409b96faa5cf95ba064.tar.gz
lwn-ddc1a5cbc05dc62743a2f409b96faa5cf95ba064.zip
mempolicy: alloc_pages_mpol() for NUMA policy without vma
Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/swap_state.c')
-rw-r--r--mm/swap_state.c86
1 files changed, 54 insertions, 32 deletions
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ab79ffb71736..85d9e5806a6a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
+#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
@@ -410,8 +411,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
}
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr,
- bool *new_page_allocated)
+ struct mempolicy *mpol, pgoff_t ilx,
+ bool *new_page_allocated)
{
struct swap_info_struct *si;
struct folio *folio;
@@ -453,7 +454,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
* cause any racers to loop around until we add it to cache.
*/
- folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
+ folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0,
+ mpol, ilx, numa_node_id());
if (!folio)
goto fail_put_swap;
@@ -528,14 +530,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
unsigned long addr, struct swap_iocb **plug)
{
- bool page_was_allocated;
- struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
- vma, addr, &page_was_allocated);
+ bool page_allocated;
+ struct mempolicy *mpol;
+ pgoff_t ilx;
+ struct page *page;
- if (page_was_allocated)
- swap_readpage(retpage, false, plug);
+ mpol = get_vma_policy(vma, addr, 0, &ilx);
+ page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+ &page_allocated);
+ mpol_cond_put(mpol);
- return retpage;
+ if (page_allocated)
+ swap_readpage(page, false, plug);
+ return page;
}
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
@@ -603,7 +610,8 @@ static unsigned long swapin_nr_pages(unsigned long offset)
* swap_cluster_readahead - swap in pages in hope we need them soon
* @entry: swap entry of this memory
* @gfp_mask: memory allocation flags
- * @vmf: fault information
+ * @mpol: NUMA memory allocation policy to be applied
+ * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
*
* Returns the struct page for entry and addr, after queueing swapin.
*
@@ -612,13 +620,12 @@ static unsigned long swapin_nr_pages(unsigned long offset)
* because it doesn't cost us any seek time. We also make sure to queue
* the 'original' request together with the readahead ones...
*
- * This has been extended to use the NUMA policies from the mm triggering
- * the readahead.
- *
- * Caller must hold read mmap_lock if vmf->vma is not NULL.
+ * Note: it is intentional that the same NUMA policy and interleave index
+ * are used for every page of the readahead: neighbouring pages on swap
+ * are fairly likely to have been swapped out from the same node.
*/
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_fault *vmf)
+ struct mempolicy *mpol, pgoff_t ilx)
{
struct page *page;
unsigned long entry_offset = swp_offset(entry);
@@ -629,8 +636,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct blk_plug plug;
struct swap_iocb *splug = NULL;
bool page_allocated;
- struct vm_area_struct *vma = vmf->vma;
- unsigned long addr = vmf->address;
mask = swapin_nr_pages(offset) - 1;
if (!mask)
@@ -648,8 +653,8 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
for (offset = start_offset; offset <= end_offset ; offset++) {
/* Ok, do the async read-ahead now */
page = __read_swap_cache_async(
- swp_entry(swp_type(entry), offset),
- gfp_mask, vma, addr, &page_allocated);
+ swp_entry(swp_type(entry), offset),
+ gfp_mask, mpol, ilx, &page_allocated);
if (!page)
continue;
if (page_allocated) {
@@ -663,11 +668,14 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
}
blk_finish_plug(&plug);
swap_read_unplug(splug);
-
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
/* The page was likely read above, so no need for plugging here */
- return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL);
+ page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+ &page_allocated);
+ if (unlikely(page_allocated))
+ swap_readpage(page, false, NULL);
+ return page;
}
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -765,8 +773,10 @@ static void swap_ra_info(struct vm_fault *vmf,
/**
* swap_vma_readahead - swap in pages in hope we need them soon
- * @fentry: swap entry of this memory
+ * @targ_entry: swap entry of the targeted memory
* @gfp_mask: memory allocation flags
+ * @mpol: NUMA memory allocation policy to be applied
+ * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
* @vmf: fault information
*
* Returns the struct page for entry and addr, after queueing swapin.
@@ -777,16 +787,17 @@ static void swap_ra_info(struct vm_fault *vmf,
* Caller must hold read mmap_lock if vmf->vma is not NULL.
*
*/
-static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
+ struct mempolicy *mpol, pgoff_t targ_ilx,
struct vm_fault *vmf)
{
struct blk_plug plug;
struct swap_iocb *splug = NULL;
- struct vm_area_struct *vma = vmf->vma;
struct page *page;
pte_t *pte = NULL, pentry;
unsigned long addr;
swp_entry_t entry;
+ pgoff_t ilx;
unsigned int i;
bool page_allocated;
struct vma_swap_readahead ra_info = {
@@ -798,9 +809,10 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
goto skip;
addr = vmf->address - (ra_info.offset * PAGE_SIZE);
+ ilx = targ_ilx - ra_info.offset;
blk_start_plug(&plug);
- for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) {
+ for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) {
if (!pte++) {
pte = pte_offset_map(vmf->pmd, addr);
if (!pte)
@@ -814,8 +826,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
continue;
pte_unmap(pte);
pte = NULL;
- page = __read_swap_cache_async(entry, gfp_mask, vma,
- addr, &page_allocated);
+ page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+ &page_allocated);
if (!page)
continue;
if (page_allocated) {
@@ -834,8 +846,11 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
lru_add_drain();
skip:
/* The page was likely read above, so no need for plugging here */
- return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
- NULL);
+ page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
+ &page_allocated);
+ if (unlikely(page_allocated))
+ swap_readpage(page, false, NULL);
+ return page;
}
/**
@@ -853,9 +868,16 @@ skip:
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
- return swap_use_vma_readahead() ?
- swap_vma_readahead(entry, gfp_mask, vmf) :
- swap_cluster_readahead(entry, gfp_mask, vmf);
+ struct mempolicy *mpol;
+ pgoff_t ilx;
+ struct page *page;
+
+ mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
+ page = swap_use_vma_readahead() ?
+ swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
+ swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+ mpol_cond_put(mpol);
+ return page;
}
#ifdef CONFIG_SYSFS