Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  1102
1 file changed, 638 insertions(+), 464 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eaaec19caa7c..6fccfe6d046c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -14,9 +14,11 @@
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
+#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
+#include <linux/minmax.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
@@ -40,6 +42,7 @@
#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
+#include <asm/setup.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
@@ -48,18 +51,33 @@
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"
+#include "hugetlb_cma.h"
+#include <linux/page-isolation.h>
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
-#ifdef CONFIG_CMA
-static struct cma *hugetlb_cma[MAX_NUMNODES];
-static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
-#endif
-static unsigned long hugetlb_cma_size __initdata;
-
__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
+static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata;
+
+/*
+ * Due to ordering constraints across the init code for various
+ * architectures, hugetlb hstate cmdline parameters can't simply
+ * be early_param. early_param might call the setup function
+ * before valid hugetlb page sizes are determined, leading to
+ * incorrect rejection of valid hugepagesz= options.
+ *
+ * So, record the parameters early and consume them whenever the
+ * init code is ready for them, by calling hugetlb_parse_params().
+ */
+
+/* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */
+#define HUGE_MAX_CMDLINE_ARGS (2 * HUGE_MAX_HSTATE + 1)
+struct hugetlb_cmdline {
+ char *val;
+ int (*setup)(char *val);
+};
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
@@ -67,6 +85,21 @@ static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
+static unsigned long hugepage_allocation_threads __initdata;
+
+static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata;
+static int hstate_cmdline_index __initdata;
+static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata;
+static int hugetlb_param_index __initdata;
+static __init int hugetlb_add_param(char *s, int (*setup)(char *val));
+static __init void hugetlb_parse_params(void);
+
+#define hugetlb_early_param(str, func) \
+static __init int func##args(char *s) \
+{ \
+ return hugetlb_add_param(s, func); \
+} \
+early_param(str, func##args)
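A sketch of what the macro above generates for hugetlb_early_param("hugepagesz", hugepagesz_setup), used later in this diff: the generated early_param handler only records the raw string, and hugepagesz_setup() itself runs later from hugetlb_parse_params().

/* Expansion of hugetlb_early_param("hugepagesz", hugepagesz_setup): */
static __init int hugepagesz_setupargs(char *s)
{
	/*
	 * Record the string only; hugetlb_parse_params() invokes
	 * hugepagesz_setup() once hugetlb page sizes are known.
	 */
	return hugetlb_add_param(s, hugepagesz_setup);
}
early_param("hugepagesz", hugepagesz_setupargs);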
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -92,12 +125,11 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static void hugetlb_free_folio(struct folio *folio)
{
-#ifdef CONFIG_CMA
- int nid = folio_nid(folio);
-
- if (cma_free_folio(hugetlb_cma[nid], folio))
+ if (folio_test_hugetlb_cma(folio)) {
+ hugetlb_cma_free_folio(folio);
return;
-#endif
+ }
+
folio_put(folio);
}
@@ -1246,69 +1278,6 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
hugetlb_dup_vma_private(vma);
}
-/* Returns true if the VMA has associated reserve pages */
-static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
-{
- if (vma->vm_flags & VM_NORESERVE) {
- /*
- * This address is already reserved by other process(chg == 0),
- * so, we should decrement reserved count. Without decrementing,
- * reserve count remains after releasing inode, because this
- * allocated page will go into page cache and is regarded as
- * coming from reserved pool in releasing step. Currently, we
- * don't have any other solution to deal with this situation
- * properly, so add work-around here.
- */
- if (vma->vm_flags & VM_MAYSHARE && chg == 0)
- return true;
- else
- return false;
- }
-
- /* Shared mappings always use reserves */
- if (vma->vm_flags & VM_MAYSHARE) {
- /*
- * We know VM_NORESERVE is not set. Therefore, there SHOULD
- * be a region map for all pages. The only situation where
- * there is no region map is if a hole was punched via
- * fallocate. In this case, there really are no reserves to
- * use. This situation is indicated if chg != 0.
- */
- if (chg)
- return false;
- else
- return true;
- }
-
- /*
- * Only the process that called mmap() has reserves for
- * private mappings.
- */
- if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
- /*
- * Like the shared case above, a hole punch or truncate
- * could have been performed on the private mapping.
- * Examine the value of chg to determine if reserves
- * actually exist or were previously consumed.
- * Very Subtle - The value of chg comes from a previous
- * call to vma_needs_reserves(). The reserve map for
- * private mappings has different (opposite) semantics
- * than that of shared mappings. vma_needs_reserves()
- * has already taken this difference in semantics into
- * account. Therefore, the meaning of chg is the same
- * as in the shared case above. Code could easily be
- * combined, but keeping it separate draws attention to
- * subtle differences.
- */
- if (chg)
- return false;
- else
- return true;
- }
-
- return false;
-}
-
static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
{
int nid = folio_nid(folio);
@@ -1336,6 +1305,9 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
if (folio_test_hwpoison(folio))
continue;
+ if (is_migrate_isolate_page(&folio->page))
+ continue;
+
list_move(&folio->lru, &h->hugepage_activelist);
folio_ref_unfreeze(folio, 1);
folio_clear_hugetlb_freed(folio);
@@ -1394,8 +1366,7 @@ static unsigned long available_huge_pages(struct hstate *h)
static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
struct vm_area_struct *vma,
- unsigned long address, int avoid_reserve,
- long chg)
+ unsigned long address, long gbl_chg)
{
struct folio *folio = NULL;
struct mempolicy *mpol;
@@ -1404,15 +1375,10 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
int nid;
/*
- * A child process with MAP_PRIVATE mappings created by their parent
- * have no page reserves. This check ensures that reservations are
- * not "stolen". The child may still get SIGKILLed
+ * gbl_chg==1 means the allocation requires a new page that was not
+ * reserved before. Make sure there's at least one free page.
*/
- if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
- goto err;
-
- /* If reserves cannot be used, ensure enough pages are in the pool */
- if (avoid_reserve && !available_huge_pages(h))
+ if (gbl_chg && !available_huge_pages(h))
goto err;
gfp_mask = htlb_alloc_mask(h);
@@ -1430,11 +1396,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
nid, nodemask);
- if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) {
- folio_set_hugetlb_restore_reserve(folio);
- h->resv_huge_pages--;
- }
-
mpol_cond_put(mpol);
return folio;
@@ -1525,27 +1486,11 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
retry:
- folio = NULL;
-#ifdef CONFIG_CMA
- {
- int node;
-
- if (hugetlb_cma[nid])
- folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask);
-
- if (!folio && !(gfp_mask & __GFP_THISNODE)) {
- for_each_node_mask(node, *nodemask) {
- if (node == nid || !hugetlb_cma[node])
- continue;
-
- folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask);
- if (folio)
- break;
- }
- }
- }
-#endif
+ folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask);
if (!folio) {
+ if (hugetlb_cma_exclusive_alloc())
+ return NULL;
+
folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask);
if (!folio)
return NULL;
@@ -1704,7 +1649,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
folio_ref_unfreeze(folio, 1);
- INIT_LIST_HEAD(&folio->_deferred_list);
hugetlb_free_folio(folio);
}
@@ -2205,6 +2149,8 @@ retry:
if (!folio_ref_count(folio)) {
struct hstate *h = folio_hstate(folio);
+ bool adjust_surplus = false;
+
if (!available_huge_pages(h))
goto out;
@@ -2227,7 +2173,9 @@ retry:
goto retry;
}
- remove_hugetlb_folio(h, folio, false);
+ if (h->surplus_huge_pages_node[folio_nid(folio)])
+ adjust_surplus = true;
+ remove_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
@@ -2247,7 +2195,7 @@ retry:
rc = hugetlb_vmemmap_restore_folio(h, folio);
if (rc) {
spin_lock_irq(&hugetlb_lock);
- add_hugetlb_folio(h, folio, false);
+ add_hugetlb_folio(h, folio, adjust_surplus);
h->max_huge_pages++;
goto out;
}
@@ -2311,12 +2259,21 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
- folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
+ folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
if (!folio)
return NULL;
+ hugetlb_vmemmap_optimize_folio(h, folio);
+
spin_lock_irq(&hugetlb_lock);
/*
+ * nr_huge_pages needs to be adjusted within the same lock cycle
+ * as surplus_pages, otherwise it might confuse
+ * persistent_huge_pages() momentarily.
+ */
+ __prep_account_new_huge_page(h, nid);
+
+ /*
* We could have raced with the pool size change.
* Double check that and simply deallocate the new page
* if we would end up overcommiting the surpluses. Abuse
@@ -2463,7 +2420,13 @@ static int gather_surplus_pages(struct hstate *h, long delta)
long needed, allocated;
bool alloc_ok = true;
int node;
- nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
+ nodemask_t *mbind_nodemask, alloc_nodemask;
+
+ mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
+ if (mbind_nodemask)
+ nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed);
+ else
+ alloc_nodemask = cpuset_current_mems_allowed;
lockdep_assert_held(&hugetlb_lock);
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
@@ -2479,8 +2442,16 @@ retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
folio = NULL;
- for_each_node_mask(node, cpuset_current_mems_allowed) {
- if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) {
+
+ /* Prioritize current node */
+ if (node_isset(numa_mem_id(), alloc_nodemask))
+ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
+ numa_mem_id(), NULL);
+
+ if (!folio) {
+ for_each_node_mask(node, alloc_nodemask) {
+ if (node == numa_mem_id())
+ continue;
folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
node, NULL);
if (folio)
@@ -2868,7 +2839,7 @@ retry:
* Fail with -EBUSY if not possible.
*/
spin_unlock_irq(&hugetlb_lock);
- isolated = isolate_hugetlb(old_folio, list);
+ isolated = folio_isolate_hugetlb(old_folio, list);
ret = isolated ? 0 : -EBUSY;
spin_lock_irq(&hugetlb_lock);
goto free_new;
@@ -2953,7 +2924,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
if (hstate_is_gigantic(h))
return -ENOMEM;
- if (folio_ref_count(folio) && isolate_hugetlb(folio, list))
+ if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
ret = 0;
else if (!folio_ref_count(folio))
ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
@@ -2961,69 +2932,137 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
return ret;
}
+/*
+ * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn
+ * range with new folios.
+ * @start_pfn: start pfn of the given pfn range
+ * @end_pfn: end pfn of the given pfn range
+ * Returns 0 on success, otherwise a negative error code.
+ */
+int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct hstate *h;
+ struct folio *folio;
+ int ret = 0;
+
+ LIST_HEAD(isolate_list);
+
+ while (start_pfn < end_pfn) {
+ folio = pfn_folio(start_pfn);
+ if (folio_test_hugetlb(folio)) {
+ h = folio_hstate(folio);
+ } else {
+ start_pfn++;
+ continue;
+ }
+
+ if (!folio_ref_count(folio)) {
+ ret = alloc_and_dissolve_hugetlb_folio(h, folio,
+ &isolate_list);
+ if (ret)
+ break;
+
+ putback_movable_pages(&isolate_list);
+ }
+ start_pfn++;
+ }
+
+ return ret;
+}
+
+void wait_for_freed_hugetlb_folios(void)
+{
+ if (llist_empty(&hpage_freelist))
+ return;
+
+ flush_work(&free_hpage_work);
+}
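A minimal sketch of a hypothetical caller (for instance, a contiguous-range allocation path) combining the two helpers above before isolating a PFN range; the helper name is an assumption, not part of this patch.

/* Hypothetical helper: ensure no free hugetlb folios remain in the range */
static int hugetlb_prepare_pfn_range(unsigned long start_pfn, unsigned long end_pfn)
{
	/* Let pending hugetlb frees reach the freelists first */
	wait_for_freed_hugetlb_folios();

	/* Replace any free hugetlb folios inside the range with new ones */
	return replace_free_hugepage_folios(start_pfn, end_pfn);
}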
+
+typedef enum {
+ /*
+ * For either 0/1: we checked the per-vma resv map, and one resv
+ * count either can be reused (0), or an extra needed (1).
+ */
+ MAP_CHG_REUSE = 0,
+ MAP_CHG_NEEDED = 1,
+ /*
+ * The per-vma resv count cannot be used, hence a new resv
+ * count is enforced.
+ *
+ * NOTE: This is mostly identical to MAP_CHG_NEEDED, except
+ * that currently vma_needs_reservation() has an unwanted side
+ * effect of requiring either end() or commit() to complete the
+ * transaction. Hence it needs to be differentiated from NEEDED.
+ */
+ MAP_CHG_ENFORCED = 2,
+} map_chg_state;
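A rough comment-sketch summarizing how alloc_hugetlb_folio() below consumes these three states, based on the code that follows in this hunk.

/*
 * MAP_CHG_REUSE    - a vma-level reservation is consumed: no subpool or
 *                    reservation-cgroup charge, gbl_chg stays 0, and
 *                    vma_commit_reservation() is still called.
 * MAP_CHG_NEEDED   - charge the subpool and the reservation cgroup; the
 *                    commit() return value is checked for a race with
 *                    hugetlb_reserve_pages().
 * MAP_CHG_ENFORCED - charge the subpool and the reservation cgroup, but
 *                    skip the vma needs()/commit()/end() transaction
 *                    entirely.
 */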
+
+/*
+ * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
+ * faults of hugetlb private mappings on top of a non-page-cache folio (in
+ * which case even if there's a private vma resv map it won't cover such
+ * allocation). New call sites should (probably) never set it to true!!
+ * When it's set, the allocation will bypass all vma level reservations.
+ */
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve)
+ unsigned long addr, bool cow_from_owner)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct folio *folio;
- long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
- long gbl_chg;
- int memcg_charge_ret, ret, idx;
+ long retval, gbl_chg;
+ map_chg_state map_chg;
+ int ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
- struct mem_cgroup *memcg;
- bool deferred_reserve;
gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
- memcg = get_mem_cgroup_from_current();
- memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
- if (memcg_charge_ret == -ENOMEM) {
- mem_cgroup_put(memcg);
- return ERR_PTR(-ENOMEM);
- }
-
idx = hstate_index(h);
- /*
- * Examine the region/reserve map to determine if the process
- * has a reservation for the page to be allocated. A return
- * code of zero indicates a reservation exists (no change).
- */
- map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
- if (map_chg < 0) {
- if (!memcg_charge_ret)
- mem_cgroup_cancel_charge(memcg, nr_pages);
- mem_cgroup_put(memcg);
- return ERR_PTR(-ENOMEM);
+
+ /* Whether we need a separate per-vma reservation? */
+ if (cow_from_owner) {
+ /*
+ * Special case! Since it's a CoW on top of a reserved
+ * page, the private resv map doesn't count. So it cannot
+ * consume the per-vma resv map even if it's reserved.
+ */
+ map_chg = MAP_CHG_ENFORCED;
+ } else {
+ /*
+ * Examine the region/reserve map to determine if the process
+ * has a reservation for the page to be allocated. A return
+ * code of zero indicates a reservation exists (no change).
+ */
+ retval = vma_needs_reservation(h, vma, addr);
+ if (retval < 0)
+ return ERR_PTR(-ENOMEM);
+ map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
}
/*
+ * Whether we need a separate global reservation?
+ *
* Processes that did not create the mapping will have no
* reserves as indicated by the region/reserve map. Check
* that the allocation will not exceed the subpool limit.
- * Allocations for MAP_NORESERVE mappings also need to be
- * checked against any subpool limit.
+ * Or if it can get one from the pool reservation directly.
*/
- if (map_chg || avoid_reserve) {
+ if (map_chg) {
gbl_chg = hugepage_subpool_get_pages(spool, 1);
if (gbl_chg < 0)
goto out_end_reservation;
-
+ } else {
/*
- * Even though there was no reservation in the region/reserve
- * map, there could be reservations associated with the
- * subpool that can be used. This would be indicated if the
- * return value of hugepage_subpool_get_pages() is zero.
- * However, if avoid_reserve is specified we still avoid even
- * the subpool reservations.
+ * If we have the vma reservation ready, no need for extra
+ * global reservation.
*/
- if (avoid_reserve)
- gbl_chg = 1;
+ gbl_chg = 0;
}
- /* If this allocation is not consuming a reservation, charge it now.
+ /*
+ * If this allocation is not consuming a per-vma reservation,
+ * charge the hugetlb cgroup now.
*/
- deferred_reserve = map_chg || avoid_reserve;
- if (deferred_reserve) {
+ if (map_chg) {
ret = hugetlb_cgroup_charge_cgroup_rsvd(
idx, pages_per_huge_page(h), &h_cg);
if (ret)
@@ -3040,27 +3079,32 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
* from the global free pool (global change). gbl_chg == 0 indicates
* a reservation exists for the allocation.
*/
- folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg);
+ folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg);
if (!folio) {
spin_unlock_irq(&hugetlb_lock);
folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
if (!folio)
goto out_uncharge_cgroup;
spin_lock_irq(&hugetlb_lock);
- if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
- folio_set_hugetlb_restore_reserve(folio);
- h->resv_huge_pages--;
- }
list_add(&folio->lru, &h->hugepage_activelist);
folio_ref_unfreeze(folio, 1);
/* Fall through */
}
+ /*
+ * Whether dequeued or buddy-allocated, the folio needs a special
+ * mark set when it consumes a global reservation.
+ */
+ if (!gbl_chg) {
+ folio_set_hugetlb_restore_reserve(folio);
+ h->resv_huge_pages--;
+ }
+
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
/* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
*/
- if (deferred_reserve) {
+ if (map_chg) {
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
h_cg, folio);
}
@@ -3069,53 +3113,114 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
hugetlb_set_folio_subpool(folio, spool);
- map_commit = vma_commit_reservation(h, vma, addr);
- if (unlikely(map_chg > map_commit)) {
+ if (map_chg != MAP_CHG_ENFORCED) {
+ /* commit() is only needed if the map_chg is not enforced */
+ retval = vma_commit_reservation(h, vma, addr);
/*
+ * Check for possible race conditions. When one happens:
* The page was added to the reservation map between
* vma_needs_reservation and vma_commit_reservation.
* This indicates a race with hugetlb_reserve_pages.
* Adjust for the subpool count incremented above AND
- * in hugetlb_reserve_pages for the same page. Also,
+ * in hugetlb_reserve_pages for the same page. Also,
* the reservation count added in hugetlb_reserve_pages
* no longer applies.
*/
- long rsv_adjust;
+ if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) {
+ long rsv_adjust;
- rsv_adjust = hugepage_subpool_put_pages(spool, 1);
- hugetlb_acct_memory(h, -rsv_adjust);
- if (deferred_reserve) {
- spin_lock_irq(&hugetlb_lock);
- hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
- pages_per_huge_page(h), folio);
- spin_unlock_irq(&hugetlb_lock);
+ rsv_adjust = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -rsv_adjust);
+ if (map_chg) {
+ spin_lock_irq(&hugetlb_lock);
+ hugetlb_cgroup_uncharge_folio_rsvd(
+ hstate_index(h), pages_per_huge_page(h),
+ folio);
+ spin_unlock_irq(&hugetlb_lock);
+ }
}
}
- if (!memcg_charge_ret)
- mem_cgroup_commit_charge(folio, memcg);
+ ret = mem_cgroup_charge_hugetlb(folio, gfp);
+ /*
+ * Unconditionally increment NR_HUGETLB here. If it turns out that
+ * mem_cgroup_charge_hugetlb failed, then immediately free the page and
+ * decrement NR_HUGETLB.
+ */
lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));
- mem_cgroup_put(memcg);
+
+ if (ret == -ENOMEM) {
+ free_huge_folio(folio);
+ return ERR_PTR(-ENOMEM);
+ }
return folio;
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_uncharge_cgroup_reservation:
- if (deferred_reserve)
+ if (map_chg)
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
h_cg);
out_subpool_put:
- if (map_chg || avoid_reserve)
+ if (map_chg)
hugepage_subpool_put_pages(spool, 1);
out_end_reservation:
- vma_end_reservation(h, vma, addr);
- if (!memcg_charge_ret)
- mem_cgroup_cancel_charge(memcg, nr_pages);
- mem_cgroup_put(memcg);
+ if (map_chg != MAP_CHG_ENFORCED)
+ vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
+static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact)
+{
+ struct huge_bootmem_page *m;
+ int listnode = nid;
+
+ if (hugetlb_early_cma(h))
+ m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact);
+ else {
+ if (node_exact)
+ m = memblock_alloc_exact_nid_raw(huge_page_size(h),
+ huge_page_size(h), 0,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ else {
+ m = memblock_alloc_try_nid_raw(huge_page_size(h),
+ huge_page_size(h), 0,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ /*
+ * For pre-HVO to work correctly, pages need to be on
+ * the list for the node they were actually allocated
+ * from. That node may be different in the case of
+ * fallback by memblock_alloc_try_nid_raw. So,
+ * extract the actual node first.
+ */
+ if (m)
+ listnode = early_pfn_to_nid(PHYS_PFN(virt_to_phys(m)));
+ }
+
+ if (m) {
+ m->flags = 0;
+ m->cma = NULL;
+ }
+ }
+
+ if (m) {
+ /*
+ * Use the beginning of the huge page to store the
+ * huge_bootmem_page struct (until gather_bootmem
+ * puts them into the mem_map).
+ *
+ * Put them into a private list first because mem_map
+ * is not up yet.
+ */
+ INIT_LIST_HEAD(&m->list);
+ list_add(&m->list, &huge_boot_pages[listnode]);
+ m->hstate = h;
+ }
+
+ return m;
+}
+
int alloc_bootmem_huge_page(struct hstate *h, int nid)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
int __alloc_bootmem_huge_page(struct hstate *h, int nid)
@@ -3125,22 +3230,15 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
/* do node specific alloc */
if (nid != NUMA_NO_NODE) {
- m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
- 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ m = alloc_bootmem(h, node, true);
if (!m)
return 0;
goto found;
}
+
/* allocate from next node when distributing huge pages */
- for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) {
- m = memblock_alloc_try_nid_raw(
- huge_page_size(h), huge_page_size(h),
- 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
- /*
- * Use the beginning of the huge page to store the
- * huge_bootmem_page struct (until gather_bootmem
- * puts them into the mem_map).
- */
+ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) {
+ m = alloc_bootmem(h, node, false);
if (!m)
return 0;
goto found;
@@ -3157,10 +3255,7 @@ found:
*/
memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
huge_page_size(h) - PAGE_SIZE);
- /* Put them into a private list first because mem_map is not up yet */
- INIT_LIST_HEAD(&m->list);
- list_add(&m->list, &huge_boot_pages[node]);
- m->hstate = h;
+
return 1;
}
@@ -3178,7 +3273,6 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
struct page *page = pfn_to_page(pfn);
- __ClearPageReserved(folio_page(folio, pfn - head_pfn));
__init_single_page(page, pfn, zone, nid);
prep_compound_tail((struct page *)folio, pfn - head_pfn);
ret = page_ref_freeze(page, 1);
@@ -3202,6 +3296,42 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
prep_compound_head((struct page *)folio, huge_page_order(h));
}
+static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
+{
+ return m->flags & HUGE_BOOTMEM_HVO;
+}
+
+static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m)
+{
+ return m->flags & HUGE_BOOTMEM_CMA;
+}
+
+/*
+ * memblock-allocated pageblocks might not have the migrate type set
+ * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE)
+ * here, or MIGRATE_CMA if this was a page allocated through an early CMA
+ * reservation.
+ *
+ * In case of vmemmap optimized folios, the tail vmemmap pages are mapped
+ * read-only, but that's ok - for sparse vmemmap this does not write to
+ * the page structure.
+ */
+static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
+ struct hstate *h)
+{
+ unsigned long nr_pages = pages_per_huge_page(h), i;
+
+ WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio)));
+
+ for (i = 0; i < nr_pages; i += pageblock_nr_pages) {
+ if (folio_test_hugetlb_cma(folio))
+ init_cma_pageblock(folio_page(folio, i));
+ else
+ set_pageblock_migratetype(folio_page(folio, i),
+ MIGRATE_MOVABLE);
+ }
+}
+
static void __init prep_and_add_bootmem_folios(struct hstate *h,
struct list_head *folio_list)
{
@@ -3209,7 +3339,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
struct folio *folio, *tmp_f;
/* Send list for bulk vmemmap optimization processing */
- hugetlb_vmemmap_optimize_folios(h, folio_list);
+ hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list);
list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
@@ -3223,6 +3353,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
HUGETLB_VMEMMAP_RESERVE_PAGES,
pages_per_huge_page(h));
}
+ hugetlb_bootmem_init_migratetype(folio, h);
/* Subdivide locks to achieve better parallel performance */
spin_lock_irqsave(&hugetlb_lock, flags);
__prep_account_new_huge_page(h, folio_nid(folio));
@@ -3231,6 +3362,57 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
}
}
+bool __init hugetlb_bootmem_page_zones_valid(int nid,
+ struct huge_bootmem_page *m)
+{
+ unsigned long start_pfn;
+ bool valid;
+
+ if (m->flags & HUGE_BOOTMEM_ZONES_VALID) {
+ /*
+ * Already validated, skip check.
+ */
+ return true;
+ }
+
+ if (hugetlb_bootmem_page_earlycma(m)) {
+ valid = cma_validate_zones(m->cma);
+ goto out;
+ }
+
+ start_pfn = virt_to_phys(m) >> PAGE_SHIFT;
+
+ valid = !pfn_range_intersects_zones(nid, start_pfn,
+ pages_per_huge_page(m->hstate));
+out:
+ if (!valid)
+ hstate_boot_nrinvalid[hstate_index(m->hstate)]++;
+
+ return valid;
+}
+
+/*
+ * Free a bootmem page that was found to be invalid (intersecting with
+ * multiple zones).
+ *
+ * Since it intersects with multiple zones, we can't just do a free
+ * operation on all pages at once, but instead have to walk all
+ * pages, freeing them one by one.
+ */
+static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page,
+ struct hstate *h)
+{
+ unsigned long npages = pages_per_huge_page(h);
+ unsigned long pfn;
+
+ while (npages--) {
+ pfn = page_to_pfn(page);
+ __init_page_from_nid(pfn, nid);
+ free_reserved_page(page);
+ page++;
+ }
+}
+
/*
* Put bootmem huge pages into the standard lists after mem_map is up.
* Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
@@ -3238,14 +3420,25 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
static void __init gather_bootmem_prealloc_node(unsigned long nid)
{
LIST_HEAD(folio_list);
- struct huge_bootmem_page *m;
+ struct huge_bootmem_page *m, *tm;
struct hstate *h = NULL, *prev_h = NULL;
- list_for_each_entry(m, &huge_boot_pages[nid], list) {
+ list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
struct page *page = virt_to_page(m);
struct folio *folio = (void *)page;
h = m->hstate;
+ if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
+ /*
+ * Can't use this page. Initialize the
+ * page structures if that hasn't already
+ * been done, and give them to the page
+ * allocator.
+ */
+ hugetlb_bootmem_free_invalid_page(nid, page, h);
+ continue;
+ }
+
/*
* It is possible to have multiple huge page sizes (hstates)
* in this list. If so, process each size separately.
@@ -3260,14 +3453,30 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid)
hugetlb_folio_init_vmemmap(folio, h,
HUGETLB_VMEMMAP_RESERVE_PAGES);
init_new_hugetlb_folio(h, folio);
+
+ if (hugetlb_bootmem_page_prehvo(m))
+ /*
+ * If pre-HVO was done, just set the
+ * flag, the HVO code will then skip
+ * this folio.
+ */
+ folio_set_hugetlb_vmemmap_optimized(folio);
+
+ if (hugetlb_bootmem_page_earlycma(m))
+ folio_set_hugetlb_cma(folio);
+
list_add(&folio->lru, &folio_list);
/*
* We need to restore the 'stolen' pages to totalram_pages
* in order to fix confusing memory reports from free(1) and
* other side-effects, like CommitLimit going negative.
+ *
+ * For CMA pages, this is done in init_cma_pageblock
+ * (via hugetlb_bootmem_init_migratetype), so skip it here.
*/
- adjust_managed_page_count(page, pages_per_huge_page(h));
+ if (!folio_test_hugetlb_cma(folio))
+ adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
@@ -3289,7 +3498,7 @@ static void __init gather_bootmem_prealloc(void)
.thread_fn = gather_bootmem_prealloc_parallel,
.fn_arg = NULL,
.start = 0,
- .size = num_node_state(N_MEMORY),
+ .size = nr_node_ids,
.align = 1,
.min_chunk = 1,
.max_threads = num_node_state(N_MEMORY),
@@ -3407,32 +3616,44 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
.numa_aware = true
};
+ unsigned long jiffies_start;
+ unsigned long jiffies_end;
+
job.thread_fn = hugetlb_pages_alloc_boot_node;
job.start = 0;
job.size = h->max_huge_pages;
/*
- * job.max_threads is twice the num_node_state(N_MEMORY),
+ * job.max_threads is 25% of the available cpu threads by default.
*
- * Tests below indicate that a multiplier of 2 significantly improves
- * performance, and although larger values also provide improvements,
- * the gains are marginal.
+ * On large servers with terabytes of memory, huge page allocation
+ * can consume a considerable amount of time.
*
- * Therefore, choosing 2 as the multiplier strikes a good balance between
- * enhancing parallel processing capabilities and maintaining efficient
- * resource management.
+ * Tests below show how long it takes to allocate 1 TiB of memory with
+ * 2MiB huge pages. Using more threads can significantly improve allocation time.
*
- * +------------+-------+-------+-------+-------+-------+
- * | multiplier | 1 | 2 | 3 | 4 | 5 |
- * +------------+-------+-------+-------+-------+-------+
- * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms |
- * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms |
- * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms |
- * +------------+-------+-------+-------+-------+-------+
+ * +-----------------------+-------+-------+-------+-------+-------+
+ * | threads | 8 | 16 | 32 | 64 | 128 |
+ * +-----------------------+-------+-------+-------+-------+-------+
+ * | skylake 144 cpus | 44s | 22s | 16s | 19s | 20s |
+ * | cascade lake 192 cpus | 39s | 20s | 11s | 10s | 9s |
+ * +-----------------------+-------+-------+-------+-------+-------+
*/
- job.max_threads = num_node_state(N_MEMORY) * 2;
- job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2;
+ if (hugepage_allocation_threads == 0) {
+ hugepage_allocation_threads = num_online_cpus() / 4;
+ hugepage_allocation_threads = max(hugepage_allocation_threads, 1);
+ }
+
+ job.max_threads = hugepage_allocation_threads;
+ job.min_chunk = h->max_huge_pages / hugepage_allocation_threads;
+
+ jiffies_start = jiffies;
padata_do_multithreaded(&job);
+ jiffies_end = jiffies;
+
+ pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n",
+ jiffies_to_msecs(jiffies_end - jiffies_start),
+ hugepage_allocation_threads);
return h->nr_huge_pages;
}
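As a worked example of the defaults above (illustrative numbers only): booting with hugepages=524288, i.e. 1 TiB of 2 MiB pages, on a machine with 64 online CPUs gives hugepage_allocation_threads = 64 / 4 = 16, so job.max_threads = 16 and job.min_chunk = 524288 / 16 = 32768 pages per chunk.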
@@ -3451,23 +3672,17 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long allocated;
- static bool initialized __initdata;
- /* skip gigantic hugepages allocation if hugetlb_cma enabled */
- if (hstate_is_gigantic(h) && hugetlb_cma_size) {
+ /*
+ * Skip gigantic hugepages allocation if early CMA
+ * reservations are not available.
+ */
+ if (hstate_is_gigantic(h) && hugetlb_cma_total_size() &&
+ !hugetlb_early_cma(h)) {
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
return;
}
- /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */
- if (!initialized) {
- int i = 0;
-
- for (i = 0; i < MAX_NUMNODES; i++)
- INIT_LIST_HEAD(&huge_boot_pages[i]);
- initialized = true;
- }
-
/* do node specific alloc */
if (hugetlb_hstate_alloc_pages_specific_nodes(h))
return;
@@ -3500,7 +3715,7 @@ static void __init hugetlb_init_hstates(void)
*/
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
continue;
- if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
+ if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER)
continue;
for_each_hstate(h2) {
if (h2 == h)
@@ -3515,13 +3730,20 @@ static void __init hugetlb_init_hstates(void)
static void __init report_hugepages(void)
{
struct hstate *h;
+ unsigned long nrinvalid;
for_each_hstate(h) {
char buf[32];
+ nrinvalid = hstate_boot_nrinvalid[hstate_index(h)];
+ h->max_huge_pages -= nrinvalid;
+
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
buf, h->free_huge_pages);
+ if (nrinvalid)
+ pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
+ buf, nrinvalid, nrinvalid > 1 ? "s" : "");
pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
}
@@ -3806,13 +4028,15 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
struct page *page = folio_page(folio, i);
+ /* Careful: see __split_huge_page_tail() */
+ struct folio *new_folio = (struct folio *)page;
- page->mapping = NULL;
clear_compound_head(page);
prep_compound_page(page, dst->order);
- init_new_hugetlb_folio(dst, page_folio(page));
- list_add(&page->lru, &dst_list);
+ new_folio->mapping = NULL;
+ init_new_hugetlb_folio(dst, new_folio);
+ list_add(&new_folio->lru, &dst_list);
}
}
@@ -4393,14 +4617,6 @@ static void hugetlb_register_all_nodes(void) { }
#endif
-#ifdef CONFIG_CMA
-static void __init hugetlb_cma_check(void);
-#else
-static inline __init void hugetlb_cma_check(void)
-{
-}
-#endif
-
static void __init hugetlb_sysfs_init(void)
{
struct hstate *h;
@@ -4525,8 +4741,6 @@ void __init hugetlb_add_hstate(unsigned int order)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
- h->next_nid_to_alloc = first_memory_node;
- h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/SZ_1K);
@@ -4551,6 +4765,44 @@ static void __init hugepages_clear_pages_in_node(void)
}
}
+static __init int hugetlb_add_param(char *s, int (*setup)(char *))
+{
+ size_t len;
+ char *p;
+
+ if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS)
+ return -EINVAL;
+
+ len = strlen(s) + 1;
+ if (len + hstate_cmdline_index > sizeof(hstate_cmdline_buf))
+ return -EINVAL;
+
+ p = &hstate_cmdline_buf[hstate_cmdline_index];
+ memcpy(p, s, len);
+ hstate_cmdline_index += len;
+
+ hugetlb_params[hugetlb_param_index].val = p;
+ hugetlb_params[hugetlb_param_index].setup = setup;
+
+ hugetlb_param_index++;
+
+ return 0;
+}
+
+static __init void hugetlb_parse_params(void)
+{
+ int i;
+ struct hugetlb_cmdline *hcp;
+
+ for (i = 0; i < hugetlb_param_index; i++) {
+ hcp = &hugetlb_params[i];
+
+ hcp->setup(hcp->val);
+ }
+
+ hugetlb_cma_validate_params();
+}
+
/*
* hugepages command line processing
* hugepages normally follows a valid hugepagesz or default_hugepagesz
@@ -4570,7 +4822,7 @@ static int __init hugepages_setup(char *s)
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
- return 1;
+ return -EINVAL;
}
/*
@@ -4624,24 +4876,16 @@ static int __init hugepages_setup(char *s)
}
}
- /*
- * Global state is always initialized later in hugetlb_init.
- * But we need to allocate gigantic hstates here early to still
- * use the bootmem allocator.
- */
- if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
- hugetlb_hstate_alloc_pages(parsed_hstate);
-
last_mhp = mhp;
- return 1;
+ return 0;
invalid:
pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
hugepages_clear_pages_in_node();
- return 1;
+ return -EINVAL;
}
-__setup("hugepages=", hugepages_setup);
+hugetlb_early_param("hugepages", hugepages_setup);
/*
* hugepagesz command line processing
@@ -4660,7 +4904,7 @@ static int __init hugepagesz_setup(char *s)
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
- return 1;
+ return -EINVAL;
}
h = size_to_hstate(size);
@@ -4675,7 +4919,7 @@ static int __init hugepagesz_setup(char *s)
if (!parsed_default_hugepagesz || h != &default_hstate ||
default_hstate.max_huge_pages) {
pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
- return 1;
+ return -EINVAL;
}
/*
@@ -4685,14 +4929,14 @@ static int __init hugepagesz_setup(char *s)
*/
parsed_hstate = h;
parsed_valid_hugepagesz = true;
- return 1;
+ return 0;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
parsed_valid_hugepagesz = true;
- return 1;
+ return 0;
}
-__setup("hugepagesz=", hugepagesz_setup);
+hugetlb_early_param("hugepagesz", hugepagesz_setup);
/*
* default_hugepagesz command line input
@@ -4706,14 +4950,14 @@ static int __init default_hugepagesz_setup(char *s)
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
- return 1;
+ return -EINVAL;
}
size = (unsigned long)memparse(s, NULL);
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
- return 1;
+ return -EINVAL;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
@@ -4730,17 +4974,74 @@ static int __init default_hugepagesz_setup(char *s)
*/
if (default_hstate_max_huge_pages) {
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
- for_each_online_node(i)
- default_hstate.max_huge_pages_node[i] =
- default_hugepages_in_node[i];
- if (hstate_is_gigantic(&default_hstate))
- hugetlb_hstate_alloc_pages(&default_hstate);
+ /*
+ * Since this is an early parameter, we can't check
+ * NUMA node state yet, so loop through MAX_NUMNODES.
+ */
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (default_hugepages_in_node[i] != 0)
+ default_hstate.max_huge_pages_node[i] =
+ default_hugepages_in_node[i];
+ }
default_hstate_max_huge_pages = 0;
}
+ return 0;
+}
+hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup);
+
+static bool __hugetlb_bootmem_allocated __initdata;
+
+bool __init hugetlb_bootmem_allocated(void)
+{
+ return __hugetlb_bootmem_allocated;
+}
+
+void __init hugetlb_bootmem_alloc(void)
+{
+ struct hstate *h;
+ int i;
+
+ if (__hugetlb_bootmem_allocated)
+ return;
+
+ for (i = 0; i < MAX_NUMNODES; i++)
+ INIT_LIST_HEAD(&huge_boot_pages[i]);
+
+ hugetlb_parse_params();
+
+ for_each_hstate(h) {
+ h->next_nid_to_alloc = first_online_node;
+ h->next_nid_to_free = first_online_node;
+
+ if (hstate_is_gigantic(h))
+ hugetlb_hstate_alloc_pages(h);
+ }
+
+ __hugetlb_bootmem_allocated = true;
+}
+
+/*
+ * hugepage_alloc_threads command line parsing.
+ *
+ * When set, use this specific number of threads for the boot
+ * allocation of hugepages.
+ */
+static int __init hugepage_alloc_threads_setup(char *s)
+{
+ unsigned long allocation_threads;
+
+ if (kstrtoul(s, 0, &allocation_threads) != 0)
+ return 1;
+
+ if (allocation_threads == 0)
+ return 1;
+
+ hugepage_allocation_threads = allocation_threads;
+
return 1;
}
-__setup("default_hugepagesz=", default_hugepagesz_setup);
+__setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup);
static unsigned int allowed_mems_nr(struct hstate *h)
{
@@ -4845,7 +5146,7 @@ out:
return ret;
}
-static struct ctl_table hugetlb_table[] = {
+static const struct ctl_table hugetlb_table[] = {
{
.procname = "nr_hugepages",
.data = NULL,
@@ -5141,12 +5442,12 @@ const struct vm_operations_struct hugetlb_vm_ops = {
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
- int writable)
+ bool try_mkwrite)
{
pte_t entry;
unsigned int shift = huge_page_shift(hstate_vma(vma));
- if (writable) {
+ if (try_mkwrite && (vma->vm_flags & VM_WRITE)) {
entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
vma->vm_page_prot)));
} else {
@@ -5169,6 +5470,13 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
update_mmu_cache(vma, address, ptep);
}
+static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ if (vma->vm_flags & VM_WRITE)
+ set_huge_ptep_writable(vma, address, ptep);
+}
+
bool is_hugetlb_entry_migration(pte_t pte)
{
swp_entry_t swp;
@@ -5199,7 +5507,7 @@ static void
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
struct folio *new_folio, pte_t old, unsigned long sz)
{
- pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
+ pte_t newpte = make_huge_pte(vma, &new_folio->page, true);
__folio_mark_uptodate(new_folio);
hugetlb_add_new_anon_rmap(new_folio, vma, addr);
@@ -5333,7 +5641,7 @@ again:
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
- new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
+ new_folio = alloc_hugetlb_folio(dst_vma, addr, false);
if (IS_ERR(new_folio)) {
folio_put(pte_folio);
ret = PTR_ERR(new_folio);
@@ -5418,7 +5726,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
if (src_ptl != dst_ptl)
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
- pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
+ pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);
if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
huge_pte_clear(mm, new_addr, dst_pte, sz);
@@ -5593,7 +5901,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
}
- pte = huge_ptep_get_and_clear(mm, address, ptep);
+ pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
@@ -5799,7 +6107,7 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
struct hstate *h = hstate_vma(vma);
struct folio *old_folio;
struct folio *new_folio;
- int outside_reserve = 0;
+ bool cow_from_owner = 0;
vm_fault_t ret = 0;
struct mmu_notifier_range range;
@@ -5814,13 +6122,6 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
if (!unshare && huge_pte_uffd_wp(pte))
return 0;
- /*
- * hugetlb does not support FOLL_FORCE-style write faults that keep the
- * PTE mapped R/O such as maybe_mkwrite() would do.
- */
- if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
- return VM_FAULT_SIGSEGV;
-
/* Let's take out MAP_SHARED mappings first. */
if (vma->vm_flags & VM_MAYSHARE) {
set_huge_ptep_writable(vma, vmf->address, vmf->pte);
@@ -5849,7 +6150,8 @@ retry_avoidcopy:
SetPageAnonExclusive(&old_folio->page);
}
if (likely(!unshare))
- set_huge_ptep_writable(vma, vmf->address, vmf->pte);
+ set_huge_ptep_maybe_writable(vma, vmf->address,
+ vmf->pte);
delayacct_wpcopy_end();
return 0;
@@ -5868,7 +6170,7 @@ retry_avoidcopy:
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
old_folio != pagecache_folio)
- outside_reserve = 1;
+ cow_from_owner = true;
folio_get(old_folio);
@@ -5877,7 +6179,7 @@ retry_avoidcopy:
* be acquired again before returning to the caller, as expected.
*/
spin_unlock(vmf->ptl);
- new_folio = alloc_hugetlb_folio(vma, vmf->address, outside_reserve);
+ new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner);
if (IS_ERR(new_folio)) {
/*
@@ -5887,7 +6189,7 @@ retry_avoidcopy:
* reliability, unmap the page from child processes. The child
* may get SIGKILLed if it later faults.
*/
- if (outside_reserve) {
+ if (cow_from_owner) {
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx;
u32 hash;
@@ -6138,7 +6440,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
goto out;
}
- folio = alloc_hugetlb_folio(vma, vmf->address, 0);
+ folio = alloc_hugetlb_folio(vma, vmf->address, false);
if (IS_ERR(folio)) {
/*
* Returning error will result in faulting task being
@@ -6235,8 +6537,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
else
hugetlb_add_file_rmap(folio);
- new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
- && (vma->vm_flags & VM_SHARED)));
+ new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED);
/*
* If this pte was previously wr-protected, keep it wr-protected even
* if populated.
@@ -6568,7 +6869,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
spinlock_t *ptl;
int ret = -ENOMEM;
struct folio *folio;
- int writable;
bool folio_in_pagecache = false;
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
@@ -6606,7 +6906,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
goto out;
}
- folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
if (IS_ERR(folio)) {
ret = -ENOMEM;
goto out;
@@ -6648,7 +6948,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
goto out;
}
- folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
if (IS_ERR(folio)) {
folio_put(*foliop);
ret = -ENOMEM;
@@ -6722,12 +7022,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
* with wp flag set, don't set pte write bit.
*/
- if (wp_enabled || (is_continue && !vm_shared))
- writable = 0;
- else
- writable = dst_vma->vm_flags & VM_WRITE;
-
- _dst_pte = make_huge_pte(dst_vma, &folio->page, writable);
+ _dst_pte = make_huge_pte(dst_vma, &folio->page,
+ !wp_enabled && !(is_continue && !vm_shared));
/*
* Always mark UFFDIO_COPY page dirty; note that this may not be
* extremely important for hugetlbfs for now since swapping is not
@@ -7406,7 +7702,24 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
-bool isolate_hugetlb(struct folio *folio, struct list_head *list)
+/**
+ * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio
+ * @folio: the folio to isolate
+ * @list: the list to add the folio to on success
+ *
+ * Isolate an allocated (refcount > 0) hugetlb folio, marking it as
+ * isolated/non-migratable, and moving it from the active list to the
+ * given list.
+ *
+ * Isolation will fail if @folio is not an allocated hugetlb folio, or if
+ * it is already isolated/non-migratable.
+ *
+ * On success, an additional folio reference is taken that must be dropped
+ * using folio_putback_hugetlb() to undo the isolation.
+ *
+ * Return: True if isolation worked, otherwise False.
+ */
+bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list)
{
bool ret = true;
@@ -7454,7 +7767,18 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
return ret;
}
-void folio_putback_active_hugetlb(struct folio *folio)
+/**
+ * folio_putback_hugetlb - unisolate a hugetlb folio
+ * @folio: the isolated hugetlb folio
+ *
+ * Putback/un-isolate the hugetlb folio that was previously isolated using
+ * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it
+ * back onto the active list.
+ *
+ * Will drop the additional folio reference obtained through
+ * folio_isolate_hugetlb().
+ */
+void folio_putback_hugetlb(struct folio *folio)
{
spin_lock_irq(&hugetlb_lock);
folio_set_hugetlb_migratable(folio);
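A minimal usage sketch of the renamed pair, following only the kernel-doc above; do_something() is a hypothetical placeholder.

/* Illustrative only: isolate a hugetlb folio, then undo the isolation */
LIST_HEAD(pagelist);

if (folio_isolate_hugetlb(folio, &pagelist)) {
	/* folio is now off the active list; an extra reference is held */
	do_something(folio);

	/* put it back on the active list and drop the extra reference */
	folio_putback_hugetlb(folio);
}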
@@ -7501,6 +7825,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
}
spin_unlock_irq(&hugetlb_lock);
}
+
+ /*
+ * Our old folio is isolated and has "migratable" cleared until it
+ * is putback. As migration succeeded, set the new folio "migratable"
+ * and add it to the active list.
+ */
+ spin_lock_irq(&hugetlb_lock);
+ folio_set_hugetlb_migratable(new_folio);
+ list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist);
+ spin_unlock_irq(&hugetlb_lock);
}
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
@@ -7558,163 +7892,3 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
ALIGN_DOWN(vma->vm_end, PUD_SIZE));
}
-
-#ifdef CONFIG_CMA
-static bool cma_reserve_called __initdata;
-
-static int __init cmdline_parse_hugetlb_cma(char *p)
-{
- int nid, count = 0;
- unsigned long tmp;
- char *s = p;
-
- while (*s) {
- if (sscanf(s, "%lu%n", &tmp, &count) != 1)
- break;
-
- if (s[count] == ':') {
- if (tmp >= MAX_NUMNODES)
- break;
- nid = array_index_nospec(tmp, MAX_NUMNODES);
-
- s += count + 1;
- tmp = memparse(s, &s);
- hugetlb_cma_size_in_node[nid] = tmp;
- hugetlb_cma_size += tmp;
-
- /*
- * Skip the separator if have one, otherwise
- * break the parsing.
- */
- if (*s == ',')
- s++;
- else
- break;
- } else {
- hugetlb_cma_size = memparse(p, &p);
- break;
- }
- }
-
- return 0;
-}
-
-early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
-
-void __init hugetlb_cma_reserve(int order)
-{
- unsigned long size, reserved, per_node;
- bool node_specific_cma_alloc = false;
- int nid;
-
- /*
- * HugeTLB CMA reservation is required for gigantic
- * huge pages which could not be allocated via the
- * page allocator. Just warn if there is any change
- * breaking this assumption.
- */
- VM_WARN_ON(order <= MAX_PAGE_ORDER);
- cma_reserve_called = true;
-
- if (!hugetlb_cma_size)
- return;
-
- for (nid = 0; nid < MAX_NUMNODES; nid++) {
- if (hugetlb_cma_size_in_node[nid] == 0)
- continue;
-
- if (!node_online(nid)) {
- pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
- hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
- hugetlb_cma_size_in_node[nid] = 0;
- continue;
- }
-
- if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
- pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
- nid, (PAGE_SIZE << order) / SZ_1M);
- hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
- hugetlb_cma_size_in_node[nid] = 0;
- } else {
- node_specific_cma_alloc = true;
- }
- }
-
- /* Validate the CMA size again in case some invalid nodes specified. */
- if (!hugetlb_cma_size)
- return;
-
- if (hugetlb_cma_size < (PAGE_SIZE << order)) {
- pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
- (PAGE_SIZE << order) / SZ_1M);
- hugetlb_cma_size = 0;
- return;
- }
-
- if (!node_specific_cma_alloc) {
- /*
- * If 3 GB area is requested on a machine with 4 numa nodes,
- * let's allocate 1 GB on first three nodes and ignore the last one.
- */
- per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
- pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
- hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
- }
-
- reserved = 0;
- for_each_online_node(nid) {
- int res;
- char name[CMA_MAX_NAME];
-
- if (node_specific_cma_alloc) {
- if (hugetlb_cma_size_in_node[nid] == 0)
- continue;
-
- size = hugetlb_cma_size_in_node[nid];
- } else {
- size = min(per_node, hugetlb_cma_size - reserved);
- }
-
- size = round_up(size, PAGE_SIZE << order);
-
- snprintf(name, sizeof(name), "hugetlb%d", nid);
- /*
- * Note that 'order per bit' is based on smallest size that
- * may be returned to CMA allocator in the case of
- * huge page demotion.
- */
- res = cma_declare_contiguous_nid(0, size, 0,
- PAGE_SIZE << order,
- HUGETLB_PAGE_ORDER, false, name,
- &hugetlb_cma[nid], nid);
- if (res) {
- pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
- res, nid);
- continue;
- }
-
- reserved += size;
- pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
- size / SZ_1M, nid);
-
- if (reserved >= hugetlb_cma_size)
- break;
- }
-
- if (!reserved)
- /*
- * hugetlb_cma_size is used to determine if allocations from
- * cma are possible. Set to zero if no cma regions are set up.
- */
- hugetlb_cma_size = 0;
-}
-
-static void __init hugetlb_cma_check(void)
-{
- if (!hugetlb_cma_size || cma_reserve_called)
- return;
-
- pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
-}
-
-#endif /* CONFIG_CMA */