diff options
author | Joao Martins <joao.m.martins@oracle.com> | 2023-10-18 19:31:08 -0700 |
---|---|---|
committer | Andrew Morton <akpm@linux-foundation.org> | 2023-10-25 16:47:07 -0700 |
commit | f4b7e3efaddbf8fa7cffacb5956b0823b7a7730a (patch) | |
tree | 2194e67c49ed25d81b111300264588b471cba05c /mm/hugetlb_vmemmap.c | |
parent | 91f386bf0772c1976622bd0b119b78094603c3d0 (diff) | |
download | lwn-f4b7e3efaddbf8fa7cffacb5956b0823b7a7730a.tar.gz lwn-f4b7e3efaddbf8fa7cffacb5956b0823b7a7730a.zip |
hugetlb: batch PMD split for bulk vmemmap dedup
In an effort to minimize amount of TLB flushes, batch all PMD splits
belonging to a range of pages in order to perform only 1 (global) TLB
flush.
Add a flags field to the walker and pass whether it's a bulk allocation or
just a single page to decide to remap. First value
(VMEMMAP_SPLIT_NO_TLB_FLUSH) designates the request to not do the TLB
flush when we split the PMD.
Rebased and updated by Mike Kravetz
Link: https://lkml.kernel.org/r/20231019023113.345257-7-mike.kravetz@oracle.com
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Konrad Dybcio <konradybcio@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Usama Arif <usama.arif@bytedance.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/hugetlb_vmemmap.c')
-rw-r--r-- | mm/hugetlb_vmemmap.c | 92 |
1 files changed, 88 insertions, 4 deletions
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4ac521e596db..10739e4285d5 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -27,6 +27,8 @@ * @reuse_addr: the virtual address of the @reuse_page page. * @vmemmap_pages: the list head of the vmemmap pages that can be freed * or is mapped from. + * @flags: used to modify behavior in vmemmap page table walking + * operations. */ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, @@ -35,9 +37,13 @@ struct vmemmap_remap_walk { struct page *reuse_page; unsigned long reuse_addr; struct list_head *vmemmap_pages; + +/* Skip the TLB flush when we split the PMD */ +#define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) + unsigned long flags; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush) { pmd_t __pmd; int i; @@ -80,7 +86,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); - flush_tlb_kernel_range(start, start + PMD_SIZE); + if (flush) + flush_tlb_kernel_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); } @@ -127,11 +134,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, do { int ret; - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, + !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)); if (ret) return ret; next = pmd_addr_end(addr, end); + + /* + * We are only splitting, not remapping the hugetlb vmemmap + * pages. + */ + if (!walk->remap_pte) + continue; + vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -198,7 +214,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, return ret; } while (pgd++, addr = next, addr != end); - flush_tlb_kernel_range(start, end); + if (walk->remap_pte) + flush_tlb_kernel_range(start, end); return 0; } @@ -298,6 +315,36 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, } /** + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end) + * backing PMDs of the directmap into PTEs + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * + * Return: %0 on success, negative error code otherwise. + */ +static int vmemmap_remap_split(unsigned long start, unsigned long end, + unsigned long reuse) +{ + int ret; + struct vmemmap_remap_walk walk = { + .remap_pte = NULL, + .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, + }; + + /* See the comment in the vmemmap_remap_free(). */ + BUG_ON(start - reuse != PAGE_SIZE); + + mmap_read_lock(&init_mm); + ret = vmemmap_remap_range(reuse, end, &walk); + mmap_read_unlock(&init_mm); + + return ret; +} + +/** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) * to the page which @reuse is mapped to, then free vmemmap * which the range are mapped to. @@ -320,6 +367,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, + .flags = 0, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; @@ -368,6 +416,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, + .flags = 0, }; vmemmap_remap_range(reuse, end, &walk); @@ -419,6 +468,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, + .flags = 0, }; /* See the comment in the vmemmap_remap_free(). */ @@ -628,12 +678,46 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) free_vmemmap_page_list(&vmemmap_pages); } +static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head) +{ + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; + + if (!vmemmap_should_optimize(h, head)) + return 0; + + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; + + /* + * Split PMDs on the vmemmap virtual address range [@vmemmap_start, + * @vmemmap_end] + */ + return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); +} + void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { struct folio *folio; LIST_HEAD(vmemmap_pages); list_for_each_entry(folio, folio_list, lru) { + int ret = hugetlb_vmemmap_split(h, &folio->page); + + /* + * Spliting the PMD requires allocating a page, thus lets fail + * early once we encounter the first OOM. No point in retrying + * as it can be dynamically done on remap with the memory + * we get back from the vmemmap deduplication. + */ + if (ret == -ENOMEM) + break; + } + + flush_tlb_all(); + + list_for_each_entry(folio, folio_list, lru) { int ret = __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages); |