diff options
author | Hugh Dickins <hughd@google.com> | 2023-10-19 13:39:08 -0700 |
---|---|---|
committer | Andrew Morton <akpm@linux-foundation.org> | 2023-10-25 16:47:16 -0700 |
commit | ddc1a5cbc05dc62743a2f409b96faa5cf95ba064 (patch) | |
tree | 6d3b7083fd984acd1391818a32ef9a7dd46049e7 /mm/swap_state.c | |
parent | 23e4883248f0472d806c8b3422ba6257e67bf1a5 (diff) | |
download | lwn-ddc1a5cbc05dc62743a2f409b96faa5cf95ba064.tar.gz lwn-ddc1a5cbc05dc62743a2f409b96faa5cf95ba064.zip |
mempolicy: alloc_pages_mpol() for NUMA policy without vma
Shrink shmem's stack usage by eliminating the pseudo-vma from its folio
allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the
principal actor for passing mempolicy choice down to __alloc_pages(),
rather than vma_alloc_folio(gfp, order, vma, addr, hugepage).
vma_alloc_folio() and alloc_pages() remain, but as wrappers around
alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the
additional args to policy_nodemask(), which subsumes policy_node().
Cleanup throughout, cutting out some unhelpful "helpers".
It would all be much simpler without MPOL_INTERLEAVE, but that adds a
dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd
("tmpfs: distribute interleave better across nodes"), which added ino bias
to the interleave, hidden from mm/mempolicy.c until this commit.
Hence "ilx" throughout, the "interleave index". Originally I thought it
could be done just with nid, but that's wrong: the nodemask may come from
the shared policy layer below a shmem vma, or it may come from the task
layer above a shmem vma; and without the final nodemask then nodeid cannot
be decided. And how ilx is applied depends also on page order.
The interleave index is almost always irrelevant unless MPOL_INTERLEAVE:
with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX
passed down from vma-less alloc_pages() is also used as hint not to use
THP-style hugepage allocation - to avoid the overhead of a hugepage arg
(though I don't understand why we never just added a GFP bit for THP - if
it actually needs a different allocation strategy from other pages of the
same order). vma_alloc_folio() still carries its hugepage arg here, but
it is not used, and should be removed when agreed.
get_vma_policy() no longer allows a NULL vma: over time I believe we've
eradicated all the places which used to need it e.g. swapoff and madvise
used to pass NULL vma to read_swap_cache_async(), but now know the vma.
[hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()]
Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com
Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com
Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/swap_state.c')
-rw-r--r-- | mm/swap_state.c | 86 |
1 files changed, 54 insertions, 32 deletions
diff --git a/mm/swap_state.c b/mm/swap_state.c index ab79ffb71736..85d9e5806a6a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> +#include <linux/mempolicy.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/init.h> @@ -410,8 +411,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, } struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr, - bool *new_page_allocated) + struct mempolicy *mpol, pgoff_t ilx, + bool *new_page_allocated) { struct swap_info_struct *si; struct folio *folio; @@ -453,7 +454,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. */ - folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); + folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, + mpol, ilx, numa_node_id()); if (!folio) goto fail_put_swap; @@ -528,14 +530,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug) { - bool page_was_allocated; - struct page *retpage = __read_swap_cache_async(entry, gfp_mask, - vma, addr, &page_was_allocated); + bool page_allocated; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; - if (page_was_allocated) - swap_readpage(retpage, false, plug); + mpol = get_vma_policy(vma, addr, 0, &ilx); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); + mpol_cond_put(mpol); - return retpage; + if (page_allocated) + swap_readpage(page, false, plug); + return page; } static unsigned int __swapin_nr_pages(unsigned long prev_offset, @@ -603,7 +610,8 @@ static unsigned long swapin_nr_pages(unsigned long offset) * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags - * @vmf: fault information + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * * Returns the struct page for entry and addr, after queueing swapin. * @@ -612,13 +620,12 @@ static unsigned long swapin_nr_pages(unsigned long offset) * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... * - * This has been extended to use the NUMA policies from the mm triggering - * the readahead. - * - * Caller must hold read mmap_lock if vmf->vma is not NULL. + * Note: it is intentional that the same NUMA policy and interleave index + * are used for every page of the readahead: neighbouring pages on swap + * are fairly likely to have been swapped out from the same node. */ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct vm_fault *vmf) + struct mempolicy *mpol, pgoff_t ilx) { struct page *page; unsigned long entry_offset = swp_offset(entry); @@ -629,8 +636,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; - struct vm_area_struct *vma = vmf->vma; - unsigned long addr = vmf->address; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -648,8 +653,8 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ page = __read_swap_cache_async( - swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr, &page_allocated); + swp_entry(swp_type(entry), offset), + gfp_mask, mpol, ilx, &page_allocated); if (!page) continue; if (page_allocated) { @@ -663,11 +668,14 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, } blk_finish_plug(&plug); swap_read_unplug(splug); - lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); + if (unlikely(page_allocated)) + swap_readpage(page, false, NULL); + return page; } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -765,8 +773,10 @@ static void swap_ra_info(struct vm_fault *vmf, /** * swap_vma_readahead - swap in pages in hope we need them soon - * @fentry: swap entry of this memory + * @targ_entry: swap entry of the targeted memory * @gfp_mask: memory allocation flags + * @mpol: NUMA memory allocation policy to be applied + * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * * Returns the struct page for entry and addr, after queueing swapin. @@ -777,16 +787,17 @@ static void swap_ra_info(struct vm_fault *vmf, * Caller must hold read mmap_lock if vmf->vma is not NULL. * */ -static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, +static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) { struct blk_plug plug; struct swap_iocb *splug = NULL; - struct vm_area_struct *vma = vmf->vma; struct page *page; pte_t *pte = NULL, pentry; unsigned long addr; swp_entry_t entry; + pgoff_t ilx; unsigned int i; bool page_allocated; struct vma_swap_readahead ra_info = { @@ -798,9 +809,10 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, goto skip; addr = vmf->address - (ra_info.offset * PAGE_SIZE); + ilx = targ_ilx - ra_info.offset; blk_start_plug(&plug); - for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) { + for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) { if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) @@ -814,8 +826,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, continue; pte_unmap(pte); pte = NULL; - page = __read_swap_cache_async(entry, gfp_mask, vma, - addr, &page_allocated); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); if (!page) continue; if (page_allocated) { @@ -834,8 +846,11 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, lru_add_drain(); skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - NULL); + page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, + &page_allocated); + if (unlikely(page_allocated)) + swap_readpage(page, false, NULL); + return page; } /** @@ -853,9 +868,16 @@ skip: struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { - return swap_use_vma_readahead() ? - swap_vma_readahead(entry, gfp_mask, vmf) : - swap_cluster_readahead(entry, gfp_mask, vmf); + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; + + mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); + page = swap_use_vma_readahead() ? + swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : + swap_cluster_readahead(entry, gfp_mask, mpol, ilx); + mpol_cond_put(mpol); + return page; } #ifdef CONFIG_SYSFS |