diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 683 |
1 files changed, 401 insertions, 282 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65068671e460..39f92aad7bd1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -14,9 +14,11 @@ #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/compiler.h> +#include <linux/cpumask.h> #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/memblock.h> +#include <linux/minmax.h> #include <linux/sysfs.h> #include <linux/slab.h> #include <linux/sched/mm.h> @@ -40,6 +42,7 @@ #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/tlb.h> +#include <asm/setup.h> #include <linux/io.h> #include <linux/hugetlb.h> @@ -48,19 +51,33 @@ #include <linux/page_owner.h> #include "internal.h" #include "hugetlb_vmemmap.h" +#include "hugetlb_cma.h" #include <linux/page-isolation.h> int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; -#ifdef CONFIG_CMA -static struct cma *hugetlb_cma[MAX_NUMNODES]; -static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; -#endif -static unsigned long hugetlb_cma_size __initdata; - __initdata struct list_head huge_boot_pages[MAX_NUMNODES]; +static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata; + +/* + * Due to ordering constraints across the init code for various + * architectures, hugetlb hstate cmdline parameters can't simply + * be early_param. early_param might call the setup function + * before valid hugetlb page sizes are determined, leading to + * incorrect rejection of valid hugepagesz= options. + * + * So, record the parameters early and consume them whenever the + * init code is ready for them, by calling hugetlb_parse_params(). + */ + +/* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */ +#define HUGE_MAX_CMDLINE_ARGS (2 * HUGE_MAX_HSTATE + 1) +struct hugetlb_cmdline { + char *val; + int (*setup)(char *val); +}; /* for command line parsing */ static struct hstate * __initdata parsed_hstate; @@ -68,6 +85,21 @@ static unsigned long __initdata default_hstate_max_huge_pages; static bool __initdata parsed_valid_hugepagesz = true; static bool __initdata parsed_default_hugepagesz; static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; +static unsigned long hugepage_allocation_threads __initdata; + +static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata; +static int hstate_cmdline_index __initdata; +static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata; +static int hugetlb_param_index __initdata; +static __init int hugetlb_add_param(char *s, int (*setup)(char *val)); +static __init void hugetlb_parse_params(void); + +#define hugetlb_early_param(str, func) \ +static __init int func##args(char *s) \ +{ \ + return hugetlb_add_param(s, func); \ +} \ +early_param(str, func##args) /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -93,12 +125,11 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma); static void hugetlb_free_folio(struct folio *folio) { -#ifdef CONFIG_CMA - int nid = folio_nid(folio); - - if (cma_free_folio(hugetlb_cma[nid], folio)) + if (folio_test_hugetlb_cma(folio)) { + hugetlb_cma_free_folio(folio); return; -#endif + } + folio_put(folio); } @@ -1455,27 +1486,11 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: - folio = NULL; -#ifdef CONFIG_CMA - { - int node; - - if (hugetlb_cma[nid]) - folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask); - - if (!folio && !(gfp_mask & __GFP_THISNODE)) { - for_each_node_mask(node, *nodemask) { - if (node == nid || !hugetlb_cma[node]) - continue; - - folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask); - if (folio) - break; - } - } - } -#endif + folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask); if (!folio) { + if (hugetlb_cma_exclusive_alloc()) + return NULL; + folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask); if (!folio) return NULL; @@ -1634,7 +1649,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, folio_ref_unfreeze(folio, 1); - INIT_LIST_HEAD(&folio->_deferred_list); hugetlb_free_folio(folio); } @@ -2135,6 +2149,8 @@ retry: if (!folio_ref_count(folio)) { struct hstate *h = folio_hstate(folio); + bool adjust_surplus = false; + if (!available_huge_pages(h)) goto out; @@ -2157,7 +2173,9 @@ retry: goto retry; } - remove_hugetlb_folio(h, folio, false); + if (h->surplus_huge_pages_node[folio_nid(folio)]) + adjust_surplus = true; + remove_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); @@ -2177,7 +2195,7 @@ retry: rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); + add_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages++; goto out; } @@ -2241,12 +2259,21 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, goto out_unlock; spin_unlock_irq(&hugetlb_lock); - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask); + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; + hugetlb_vmemmap_optimize_folio(h, folio); + spin_lock_irq(&hugetlb_lock); /* + * nr_huge_pages needs to be adjusted within the same lock cycle + * as surplus_pages, otherwise it might confuse + * persistent_huge_pages() momentarily. + */ + __prep_account_new_huge_page(h, nid); + + /* * We could have raced with the pool size change. * Double check that and simply deallocate the new page * if we would end up overcommiting the surpluses. Abuse @@ -2943,6 +2970,14 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) return ret; } +void wait_for_freed_hugetlb_folios(void) +{ + if (llist_empty(&hpage_freelist)) + return; + + flush_work(&free_hpage_work); +} + typedef enum { /* * For either 0/1: we checked the per-vma resv map, and one resv @@ -3136,6 +3171,56 @@ out_end_reservation: return ERR_PTR(-ENOSPC); } +static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact) +{ + struct huge_bootmem_page *m; + int listnode = nid; + + if (hugetlb_early_cma(h)) + m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact); + else { + if (node_exact) + m = memblock_alloc_exact_nid_raw(huge_page_size(h), + huge_page_size(h), 0, + MEMBLOCK_ALLOC_ACCESSIBLE, nid); + else { + m = memblock_alloc_try_nid_raw(huge_page_size(h), + huge_page_size(h), 0, + MEMBLOCK_ALLOC_ACCESSIBLE, nid); + /* + * For pre-HVO to work correctly, pages need to be on + * the list for the node they were actually allocated + * from. That node may be different in the case of + * fallback by memblock_alloc_try_nid_raw. So, + * extract the actual node first. + */ + if (m) + listnode = early_pfn_to_nid(PHYS_PFN(virt_to_phys(m))); + } + + if (m) { + m->flags = 0; + m->cma = NULL; + } + } + + if (m) { + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). + * + * Put them into a private list first because mem_map + * is not up yet. + */ + INIT_LIST_HEAD(&m->list); + list_add(&m->list, &huge_boot_pages[listnode]); + m->hstate = h; + } + + return m; +} + int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h, int nid) @@ -3145,22 +3230,15 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) /* do node specific alloc */ if (nid != NUMA_NO_NODE) { - m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), - 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + m = alloc_bootmem(h, node, true); if (!m) return 0; goto found; } + /* allocate from next node when distributing huge pages */ - for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) { - m = memblock_alloc_try_nid_raw( - huge_page_size(h), huge_page_size(h), - 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); - /* - * Use the beginning of the huge page to store the - * huge_bootmem_page struct (until gather_bootmem - * puts them into the mem_map). - */ + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) { + m = alloc_bootmem(h, node, false); if (!m) return 0; goto found; @@ -3177,10 +3255,7 @@ found: */ memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE), huge_page_size(h) - PAGE_SIZE); - /* Put them into a private list first because mem_map is not up yet */ - INIT_LIST_HEAD(&m->list); - list_add(&m->list, &huge_boot_pages[node]); - m->hstate = h; + return 1; } @@ -3198,7 +3273,6 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); - __ClearPageReserved(folio_page(folio, pfn - head_pfn)); __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); ret = page_ref_freeze(page, 1); @@ -3222,6 +3296,42 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, prep_compound_head((struct page *)folio, huge_page_order(h)); } +static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m) +{ + return m->flags & HUGE_BOOTMEM_HVO; +} + +static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m) +{ + return m->flags & HUGE_BOOTMEM_CMA; +} + +/* + * memblock-allocated pageblocks might not have the migrate type set + * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE) + * here, or MIGRATE_CMA if this was a page allocated through an early CMA + * reservation. + * + * In case of vmemmap optimized folios, the tail vmemmap pages are mapped + * read-only, but that's ok - for sparse vmemmap this does not write to + * the page structure. + */ +static void __init hugetlb_bootmem_init_migratetype(struct folio *folio, + struct hstate *h) +{ + unsigned long nr_pages = pages_per_huge_page(h), i; + + WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio))); + + for (i = 0; i < nr_pages; i += pageblock_nr_pages) { + if (folio_test_hugetlb_cma(folio)) + init_cma_pageblock(folio_page(folio, i)); + else + set_pageblock_migratetype(folio_page(folio, i), + MIGRATE_MOVABLE); + } +} + static void __init prep_and_add_bootmem_folios(struct hstate *h, struct list_head *folio_list) { @@ -3229,7 +3339,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, struct folio *folio, *tmp_f; /* Send list for bulk vmemmap optimization processing */ - hugetlb_vmemmap_optimize_folios(h, folio_list); + hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { if (!folio_test_hugetlb_vmemmap_optimized(folio)) { @@ -3243,6 +3353,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } + hugetlb_bootmem_init_migratetype(folio, h); /* Subdivide locks to achieve better parallel performance */ spin_lock_irqsave(&hugetlb_lock, flags); __prep_account_new_huge_page(h, folio_nid(folio)); @@ -3251,6 +3362,57 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, } } +bool __init hugetlb_bootmem_page_zones_valid(int nid, + struct huge_bootmem_page *m) +{ + unsigned long start_pfn; + bool valid; + + if (m->flags & HUGE_BOOTMEM_ZONES_VALID) { + /* + * Already validated, skip check. + */ + return true; + } + + if (hugetlb_bootmem_page_earlycma(m)) { + valid = cma_validate_zones(m->cma); + goto out; + } + + start_pfn = virt_to_phys(m) >> PAGE_SHIFT; + + valid = !pfn_range_intersects_zones(nid, start_pfn, + pages_per_huge_page(m->hstate)); +out: + if (!valid) + hstate_boot_nrinvalid[hstate_index(m->hstate)]++; + + return valid; +} + +/* + * Free a bootmem page that was found to be invalid (intersecting with + * multiple zones). + * + * Since it intersects with multiple zones, we can't just do a free + * operation on all pages at once, but instead have to walk all + * pages, freeing them one by one. + */ +static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page, + struct hstate *h) +{ + unsigned long npages = pages_per_huge_page(h); + unsigned long pfn; + + while (npages--) { + pfn = page_to_pfn(page); + __init_page_from_nid(pfn, nid); + free_reserved_page(page); + page++; + } +} + /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. @@ -3258,14 +3420,25 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, static void __init gather_bootmem_prealloc_node(unsigned long nid) { LIST_HEAD(folio_list); - struct huge_bootmem_page *m; + struct huge_bootmem_page *m, *tm; struct hstate *h = NULL, *prev_h = NULL; - list_for_each_entry(m, &huge_boot_pages[nid], list) { + list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; h = m->hstate; + if (!hugetlb_bootmem_page_zones_valid(nid, m)) { + /* + * Can't use this page. Initialize the + * page structures if that hasn't already + * been done, and give them to the page + * allocator. + */ + hugetlb_bootmem_free_invalid_page(nid, page, h); + continue; + } + /* * It is possible to have multiple huge page sizes (hstates) * in this list. If so, process each size separately. @@ -3280,14 +3453,30 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid) hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); init_new_hugetlb_folio(h, folio); + + if (hugetlb_bootmem_page_prehvo(m)) + /* + * If pre-HVO was done, just set the + * flag, the HVO code will then skip + * this folio. + */ + folio_set_hugetlb_vmemmap_optimized(folio); + + if (hugetlb_bootmem_page_earlycma(m)) + folio_set_hugetlb_cma(folio); + list_add(&folio->lru, &folio_list); /* * We need to restore the 'stolen' pages to totalram_pages * in order to fix confusing memory reports from free(1) and * other side-effects, like CommitLimit going negative. + * + * For CMA pages, this is done in init_cma_pageblock + * (via hugetlb_bootmem_init_migratetype), so skip it here. */ - adjust_managed_page_count(page, pages_per_huge_page(h)); + if (!folio_test_hugetlb_cma(folio)) + adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } @@ -3427,32 +3616,44 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) .numa_aware = true }; + unsigned long jiffies_start; + unsigned long jiffies_end; + job.thread_fn = hugetlb_pages_alloc_boot_node; job.start = 0; job.size = h->max_huge_pages; /* - * job.max_threads is twice the num_node_state(N_MEMORY), + * job.max_threads is 25% of the available cpu threads by default. * - * Tests below indicate that a multiplier of 2 significantly improves - * performance, and although larger values also provide improvements, - * the gains are marginal. + * On large servers with terabytes of memory, huge page allocation + * can consume a considerably amount of time. * - * Therefore, choosing 2 as the multiplier strikes a good balance between - * enhancing parallel processing capabilities and maintaining efficient - * resource management. + * Tests below show how long it takes to allocate 1 TiB of memory with 2MiB huge pages. + * 2MiB huge pages. Using more threads can significantly improve allocation time. * - * +------------+-------+-------+-------+-------+-------+ - * | multiplier | 1 | 2 | 3 | 4 | 5 | - * +------------+-------+-------+-------+-------+-------+ - * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms | - * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms | - * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms | - * +------------+-------+-------+-------+-------+-------+ + * +-----------------------+-------+-------+-------+-------+-------+ + * | threads | 8 | 16 | 32 | 64 | 128 | + * +-----------------------+-------+-------+-------+-------+-------+ + * | skylake 144 cpus | 44s | 22s | 16s | 19s | 20s | + * | cascade lake 192 cpus | 39s | 20s | 11s | 10s | 9s | + * +-----------------------+-------+-------+-------+-------+-------+ */ - job.max_threads = num_node_state(N_MEMORY) * 2; - job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2; + if (hugepage_allocation_threads == 0) { + hugepage_allocation_threads = num_online_cpus() / 4; + hugepage_allocation_threads = max(hugepage_allocation_threads, 1); + } + + job.max_threads = hugepage_allocation_threads; + job.min_chunk = h->max_huge_pages / hugepage_allocation_threads; + + jiffies_start = jiffies; padata_do_multithreaded(&job); + jiffies_end = jiffies; + + pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n", + jiffies_to_msecs(jiffies_end - jiffies_start), + hugepage_allocation_threads); return h->nr_huge_pages; } @@ -3471,23 +3672,17 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long allocated; - static bool initialized __initdata; - /* skip gigantic hugepages allocation if hugetlb_cma enabled */ - if (hstate_is_gigantic(h) && hugetlb_cma_size) { + /* + * Skip gigantic hugepages allocation if early CMA + * reservations are not available. + */ + if (hstate_is_gigantic(h) && hugetlb_cma_total_size() && + !hugetlb_early_cma(h)) { pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); return; } - /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */ - if (!initialized) { - int i = 0; - - for (i = 0; i < MAX_NUMNODES; i++) - INIT_LIST_HEAD(&huge_boot_pages[i]); - initialized = true; - } - /* do node specific alloc */ if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; @@ -3520,7 +3715,7 @@ static void __init hugetlb_init_hstates(void) */ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) continue; - if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER) + if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER) continue; for_each_hstate(h2) { if (h2 == h) @@ -3535,13 +3730,20 @@ static void __init hugetlb_init_hstates(void) static void __init report_hugepages(void) { struct hstate *h; + unsigned long nrinvalid; for_each_hstate(h) { char buf[32]; + nrinvalid = hstate_boot_nrinvalid[hstate_index(h)]; + h->max_huge_pages -= nrinvalid; + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", buf, h->free_huge_pages); + if (nrinvalid) + pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n", + buf, nrinvalid, nrinvalid > 1 ? "s" : ""); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } @@ -4415,14 +4617,6 @@ static void hugetlb_register_all_nodes(void) { } #endif -#ifdef CONFIG_CMA -static void __init hugetlb_cma_check(void); -#else -static inline __init void hugetlb_cma_check(void) -{ -} -#endif - static void __init hugetlb_sysfs_init(void) { struct hstate *h; @@ -4547,8 +4741,6 @@ void __init hugetlb_add_hstate(unsigned int order) for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); - h->next_nid_to_alloc = first_memory_node; - h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/SZ_1K); @@ -4573,6 +4765,44 @@ static void __init hugepages_clear_pages_in_node(void) } } +static __init int hugetlb_add_param(char *s, int (*setup)(char *)) +{ + size_t len; + char *p; + + if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS) + return -EINVAL; + + len = strlen(s) + 1; + if (len + hstate_cmdline_index > sizeof(hstate_cmdline_buf)) + return -EINVAL; + + p = &hstate_cmdline_buf[hstate_cmdline_index]; + memcpy(p, s, len); + hstate_cmdline_index += len; + + hugetlb_params[hugetlb_param_index].val = p; + hugetlb_params[hugetlb_param_index].setup = setup; + + hugetlb_param_index++; + + return 0; +} + +static __init void hugetlb_parse_params(void) +{ + int i; + struct hugetlb_cmdline *hcp; + + for (i = 0; i < hugetlb_param_index; i++) { + hcp = &hugetlb_params[i]; + + hcp->setup(hcp->val); + } + + hugetlb_cma_validate_params(); +} + /* * hugepages command line processing * hugepages normally follows a valid hugepagsz or default_hugepagsz @@ -4592,7 +4822,7 @@ static int __init hugepages_setup(char *s) if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; - return 1; + return -EINVAL; } /* @@ -4646,24 +4876,16 @@ static int __init hugepages_setup(char *s) } } - /* - * Global state is always initialized later in hugetlb_init. - * But we need to allocate gigantic hstates here early to still - * use the bootmem allocator. - */ - if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) - hugetlb_hstate_alloc_pages(parsed_hstate); - last_mhp = mhp; - return 1; + return 0; invalid: pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); hugepages_clear_pages_in_node(); - return 1; + return -EINVAL; } -__setup("hugepages=", hugepages_setup); +hugetlb_early_param("hugepages", hugepages_setup); /* * hugepagesz command line processing @@ -4682,7 +4904,7 @@ static int __init hugepagesz_setup(char *s) if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); - return 1; + return -EINVAL; } h = size_to_hstate(size); @@ -4697,7 +4919,7 @@ static int __init hugepagesz_setup(char *s) if (!parsed_default_hugepagesz || h != &default_hstate || default_hstate.max_huge_pages) { pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); - return 1; + return -EINVAL; } /* @@ -4707,14 +4929,14 @@ static int __init hugepagesz_setup(char *s) */ parsed_hstate = h; parsed_valid_hugepagesz = true; - return 1; + return 0; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); parsed_valid_hugepagesz = true; - return 1; + return 0; } -__setup("hugepagesz=", hugepagesz_setup); +hugetlb_early_param("hugepagesz", hugepagesz_setup); /* * default_hugepagesz command line input @@ -4728,14 +4950,14 @@ static int __init default_hugepagesz_setup(char *s) parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); - return 1; + return -EINVAL; } size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); - return 1; + return -EINVAL; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); @@ -4752,17 +4974,74 @@ static int __init default_hugepagesz_setup(char *s) */ if (default_hstate_max_huge_pages) { default_hstate.max_huge_pages = default_hstate_max_huge_pages; - for_each_online_node(i) - default_hstate.max_huge_pages_node[i] = - default_hugepages_in_node[i]; - if (hstate_is_gigantic(&default_hstate)) - hugetlb_hstate_alloc_pages(&default_hstate); + /* + * Since this is an early parameter, we can't check + * NUMA node state yet, so loop through MAX_NUMNODES. + */ + for (i = 0; i < MAX_NUMNODES; i++) { + if (default_hugepages_in_node[i] != 0) + default_hstate.max_huge_pages_node[i] = + default_hugepages_in_node[i]; + } default_hstate_max_huge_pages = 0; } + return 0; +} +hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup); + +static bool __hugetlb_bootmem_allocated __initdata; + +bool __init hugetlb_bootmem_allocated(void) +{ + return __hugetlb_bootmem_allocated; +} + +void __init hugetlb_bootmem_alloc(void) +{ + struct hstate *h; + int i; + + if (__hugetlb_bootmem_allocated) + return; + + for (i = 0; i < MAX_NUMNODES; i++) + INIT_LIST_HEAD(&huge_boot_pages[i]); + + hugetlb_parse_params(); + + for_each_hstate(h) { + h->next_nid_to_alloc = first_online_node; + h->next_nid_to_free = first_online_node; + + if (hstate_is_gigantic(h)) + hugetlb_hstate_alloc_pages(h); + } + + __hugetlb_bootmem_allocated = true; +} + +/* + * hugepage_alloc_threads command line parsing. + * + * When set, use this specific number of threads for the boot + * allocation of hugepages. + */ +static int __init hugepage_alloc_threads_setup(char *s) +{ + unsigned long allocation_threads; + + if (kstrtoul(s, 0, &allocation_threads) != 0) + return 1; + + if (allocation_threads == 0) + return 1; + + hugepage_allocation_threads = allocation_threads; + return 1; } -__setup("default_hugepagesz=", default_hugepagesz_setup); +__setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup); static unsigned int allowed_mems_nr(struct hstate *h) { @@ -4900,7 +5179,7 @@ static const struct ctl_table hugetlb_table[] = { }, }; -static void hugetlb_sysctl_init(void) +static void __init hugetlb_sysctl_init(void) { register_sysctl_init("vm", hugetlb_table); } @@ -5447,7 +5726,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); + pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) huge_pte_clear(mm, new_addr, dst_pte, sz); @@ -5622,7 +5901,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } - pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); @@ -7613,163 +7892,3 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), ALIGN_DOWN(vma->vm_end, PUD_SIZE)); } - -#ifdef CONFIG_CMA -static bool cma_reserve_called __initdata; - -static int __init cmdline_parse_hugetlb_cma(char *p) -{ - int nid, count = 0; - unsigned long tmp; - char *s = p; - - while (*s) { - if (sscanf(s, "%lu%n", &tmp, &count) != 1) - break; - - if (s[count] == ':') { - if (tmp >= MAX_NUMNODES) - break; - nid = array_index_nospec(tmp, MAX_NUMNODES); - - s += count + 1; - tmp = memparse(s, &s); - hugetlb_cma_size_in_node[nid] = tmp; - hugetlb_cma_size += tmp; - - /* - * Skip the separator if have one, otherwise - * break the parsing. - */ - if (*s == ',') - s++; - else - break; - } else { - hugetlb_cma_size = memparse(p, &p); - break; - } - } - - return 0; -} - -early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); - -void __init hugetlb_cma_reserve(int order) -{ - unsigned long size, reserved, per_node; - bool node_specific_cma_alloc = false; - int nid; - - /* - * HugeTLB CMA reservation is required for gigantic - * huge pages which could not be allocated via the - * page allocator. Just warn if there is any change - * breaking this assumption. - */ - VM_WARN_ON(order <= MAX_PAGE_ORDER); - cma_reserve_called = true; - - if (!hugetlb_cma_size) - return; - - for (nid = 0; nid < MAX_NUMNODES; nid++) { - if (hugetlb_cma_size_in_node[nid] == 0) - continue; - - if (!node_online(nid)) { - pr_warn("hugetlb_cma: invalid node %d specified\n", nid); - hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; - hugetlb_cma_size_in_node[nid] = 0; - continue; - } - - if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) { - pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n", - nid, (PAGE_SIZE << order) / SZ_1M); - hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; - hugetlb_cma_size_in_node[nid] = 0; - } else { - node_specific_cma_alloc = true; - } - } - - /* Validate the CMA size again in case some invalid nodes specified. */ - if (!hugetlb_cma_size) - return; - - if (hugetlb_cma_size < (PAGE_SIZE << order)) { - pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", - (PAGE_SIZE << order) / SZ_1M); - hugetlb_cma_size = 0; - return; - } - - if (!node_specific_cma_alloc) { - /* - * If 3 GB area is requested on a machine with 4 numa nodes, - * let's allocate 1 GB on first three nodes and ignore the last one. - */ - per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); - pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", - hugetlb_cma_size / SZ_1M, per_node / SZ_1M); - } - - reserved = 0; - for_each_online_node(nid) { - int res; - char name[CMA_MAX_NAME]; - - if (node_specific_cma_alloc) { - if (hugetlb_cma_size_in_node[nid] == 0) - continue; - - size = hugetlb_cma_size_in_node[nid]; - } else { - size = min(per_node, hugetlb_cma_size - reserved); - } - - size = round_up(size, PAGE_SIZE << order); - - snprintf(name, sizeof(name), "hugetlb%d", nid); - /* - * Note that 'order per bit' is based on smallest size that - * may be returned to CMA allocator in the case of - * huge page demotion. - */ - res = cma_declare_contiguous_nid(0, size, 0, - PAGE_SIZE << order, - HUGETLB_PAGE_ORDER, false, name, - &hugetlb_cma[nid], nid); - if (res) { - pr_warn("hugetlb_cma: reservation failed: err %d, node %d", - res, nid); - continue; - } - - reserved += size; - pr_info("hugetlb_cma: reserved %lu MiB on node %d\n", - size / SZ_1M, nid); - - if (reserved >= hugetlb_cma_size) - break; - } - - if (!reserved) - /* - * hugetlb_cma_size is used to determine if allocations from - * cma are possible. Set to zero if no cma regions are set up. - */ - hugetlb_cma_size = 0; -} - -static void __init hugetlb_cma_check(void) -{ - if (!hugetlb_cma_size || cma_reserve_called) - return; - - pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); -} - -#endif /* CONFIG_CMA */ |