Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--  mm/huge_memory.c  337
1 files changed, 218 insertions, 119 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b298cba853ab..42c983821c03 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -100,6 +100,14 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
}
+/* Returns true if we are unable to access the VMA's folios. */
+static bool vma_is_special_huge(const struct vm_area_struct *vma)
+{
+ if (vma_is_dax(vma))
+ return false;
+ return vma_test_any(vma, VMA_PFNMAP_BIT, VMA_MIXEDMAP_BIT);
+}
+
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
enum tva_type type,
@@ -113,8 +121,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
/* Check the intersection of requested and supported orders. */
if (vma_is_anonymous(vma))
supported_orders = THP_ORDERS_ALL_ANON;
- else if (vma_is_special_huge(vma))
- supported_orders = THP_ORDERS_ALL_SPECIAL;
+ else if (vma_is_dax(vma) || vma_is_special_huge(vma))
+ supported_orders = THP_ORDERS_ALL_SPECIAL_DAX;
else
supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
@@ -316,30 +324,77 @@ static ssize_t enabled_show(struct kobject *kobj,
return sysfs_emit(buf, "%s\n", output);
}
+enum anon_enabled_mode {
+ ANON_ENABLED_ALWAYS = 0,
+ ANON_ENABLED_INHERIT = 1,
+ ANON_ENABLED_MADVISE = 2,
+ ANON_ENABLED_NEVER = 3,
+};
+
+static const char * const anon_enabled_mode_strings[] = {
+ [ANON_ENABLED_ALWAYS] = "always",
+ [ANON_ENABLED_INHERIT] = "inherit",
+ [ANON_ENABLED_MADVISE] = "madvise",
+ [ANON_ENABLED_NEVER] = "never",
+};
+
+enum global_enabled_mode {
+ GLOBAL_ENABLED_ALWAYS = 0,
+ GLOBAL_ENABLED_MADVISE = 1,
+ GLOBAL_ENABLED_NEVER = 2,
+};
+
+static const char * const global_enabled_mode_strings[] = {
+ [GLOBAL_ENABLED_ALWAYS] = "always",
+ [GLOBAL_ENABLED_MADVISE] = "madvise",
+ [GLOBAL_ENABLED_NEVER] = "never",
+};
+
+static bool set_global_enabled_mode(enum global_enabled_mode mode)
+{
+ static const unsigned long thp_flags[] = {
+ TRANSPARENT_HUGEPAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ };
+ enum global_enabled_mode m;
+ bool changed = false;
+
+ for (m = 0; m < ARRAY_SIZE(thp_flags); m++) {
+ if (m == mode)
+ changed |= !test_and_set_bit(thp_flags[m],
+ &transparent_hugepage_flags);
+ else
+ changed |= test_and_clear_bit(thp_flags[m],
+ &transparent_hugepage_flags);
+ }
+
+ return changed;
+}
+
static ssize_t enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- ssize_t ret = count;
+ int mode;
- if (sysfs_streq(buf, "always")) {
- clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "madvise")) {
- clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
- set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else if (sysfs_streq(buf, "never")) {
- clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
- clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else
- ret = -EINVAL;
+ mode = sysfs_match_string(global_enabled_mode_strings, buf);
+ if (mode < 0)
+ return -EINVAL;
- if (ret > 0) {
+ if (set_global_enabled_mode(mode)) {
int err = start_stop_khugepaged();
+
if (err)
- ret = err;
+ return err;
+ } else {
+ /*
+ * Recalculate watermarks even when the mode didn't
+ * change, as the previous code always called
+ * start_stop_khugepaged() which does this internally.
+ */
+ set_recommended_min_free_kbytes();
}
- return ret;
+ return count;
}
static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
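For reference, the rewritten enabled_store() accepts the same strings as before ("always", "madvise", "never"), now matched with sysfs_match_string(). A minimal userspace sketch exercising the knob, assuming a THP-enabled kernel, root privileges and the standard sysfs path (error handling kept minimal):

#include <stdio.h>

#define THP_ENABLED "/sys/kernel/mm/transparent_hugepage/enabled"

/* Write one of "always", "madvise" or "never"; anything else gets -EINVAL. */
static int thp_set_enabled(const char *mode)
{
	FILE *f = fopen(THP_ENABLED, "w");
	int ret;

	if (!f)
		return -1;
	ret = (fputs(mode, f) == EOF) ? -1 : 0;
	if (fclose(f) == EOF)
		ret = -1;
	return ret;
}

int main(void)
{
	char line[128] = "";
	FILE *f;

	if (thp_set_enabled("madvise"))
		perror("thp_set_enabled");

	f = fopen(THP_ENABLED, "r");
	if (f && fgets(line, sizeof(line), f))
		printf("%s", line);	/* e.g. "always [madvise] never" */
	if (f)
		fclose(f);
	return 0;
}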
@@ -515,48 +570,54 @@ static ssize_t anon_enabled_show(struct kobject *kobj,
return sysfs_emit(buf, "%s\n", output);
}
+static bool set_anon_enabled_mode(int order, enum anon_enabled_mode mode)
+{
+ static unsigned long *enabled_orders[] = {
+ &huge_anon_orders_always,
+ &huge_anon_orders_inherit,
+ &huge_anon_orders_madvise,
+ };
+ enum anon_enabled_mode m;
+ bool changed = false;
+
+ spin_lock(&huge_anon_orders_lock);
+ for (m = 0; m < ARRAY_SIZE(enabled_orders); m++) {
+ if (m == mode)
+ changed |= !__test_and_set_bit(order, enabled_orders[m]);
+ else
+ changed |= __test_and_clear_bit(order, enabled_orders[m]);
+ }
+ spin_unlock(&huge_anon_orders_lock);
+
+ return changed;
+}
+
static ssize_t anon_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int order = to_thpsize(kobj)->order;
- ssize_t ret = count;
+ int mode;
- if (sysfs_streq(buf, "always")) {
- spin_lock(&huge_anon_orders_lock);
- clear_bit(order, &huge_anon_orders_inherit);
- clear_bit(order, &huge_anon_orders_madvise);
- set_bit(order, &huge_anon_orders_always);
- spin_unlock(&huge_anon_orders_lock);
- } else if (sysfs_streq(buf, "inherit")) {
- spin_lock(&huge_anon_orders_lock);
- clear_bit(order, &huge_anon_orders_always);
- clear_bit(order, &huge_anon_orders_madvise);
- set_bit(order, &huge_anon_orders_inherit);
- spin_unlock(&huge_anon_orders_lock);
- } else if (sysfs_streq(buf, "madvise")) {
- spin_lock(&huge_anon_orders_lock);
- clear_bit(order, &huge_anon_orders_always);
- clear_bit(order, &huge_anon_orders_inherit);
- set_bit(order, &huge_anon_orders_madvise);
- spin_unlock(&huge_anon_orders_lock);
- } else if (sysfs_streq(buf, "never")) {
- spin_lock(&huge_anon_orders_lock);
- clear_bit(order, &huge_anon_orders_always);
- clear_bit(order, &huge_anon_orders_inherit);
- clear_bit(order, &huge_anon_orders_madvise);
- spin_unlock(&huge_anon_orders_lock);
- } else
- ret = -EINVAL;
+ mode = sysfs_match_string(anon_enabled_mode_strings, buf);
+ if (mode < 0)
+ return -EINVAL;
- if (ret > 0) {
- int err;
+ if (set_anon_enabled_mode(order, mode)) {
+ int err = start_stop_khugepaged();
- err = start_stop_khugepaged();
if (err)
- ret = err;
+ return err;
+ } else {
+ /*
+ * Recalculate watermarks even when the mode didn't
+ * change, as the previous code always called
+ * start_stop_khugepaged() which does this internally.
+ */
+ set_recommended_min_free_kbytes();
}
- return ret;
+
+ return count;
}
static struct kobj_attribute anon_enabled_attr =
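The per-size store follows the same set/clear pattern. A standalone userspace model of the set_anon_enabled_mode() loop (the bitmap array and helper below are stand-ins, not kernel code) illustrates why rewriting the current mode reports no change, so the store falls through to set_recommended_min_free_kbytes() instead of start_stop_khugepaged():

#include <stdbool.h>
#include <stdio.h>

enum { ALWAYS, INHERIT, MADVISE, NEVER };	/* mirrors anon_enabled_mode */

static unsigned long mode_orders[3];		/* stand-ins for huge_anon_orders_* */

static bool set_mode(int order, int mode)
{
	bool changed = false;
	int m;

	for (m = 0; m < 3; m++) {
		unsigned long bit = 1UL << order;

		if (m == mode) {
			changed |= !(mode_orders[m] & bit);	/* like !__test_and_set_bit() */
			mode_orders[m] |= bit;
		} else {
			changed |= !!(mode_orders[m] & bit);	/* like __test_and_clear_bit() */
			mode_orders[m] &= ~bit;
		}
	}
	/* NEVER matches no slot, so it only clears the order everywhere. */
	return changed;
}

int main(void)
{
	printf("first 'madvise':  changed=%d\n", set_mode(9, MADVISE));	/* 1 */
	printf("repeat 'madvise': changed=%d\n", set_mode(9, MADVISE));	/* 0 */
	printf("then 'never':     changed=%d\n", set_mode(9, NEVER));	/* 1 */
	return 0;
}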
@@ -2341,17 +2402,87 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
mm_dec_nr_ptes(mm);
}
-int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+static void zap_huge_pmd_folio(struct mm_struct *mm, struct vm_area_struct *vma,
+ pmd_t pmdval, struct folio *folio, bool is_present)
+{
+ const bool is_device_private = folio_is_device_private(folio);
+
+ /* Present and device private folios are rmappable. */
+ if (is_present || is_device_private)
+ folio_remove_rmap_pmd(folio, &folio->page, vma);
+
+ if (folio_test_anon(folio)) {
+ add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ } else {
+ add_mm_counter(mm, mm_counter_file(folio),
+ -HPAGE_PMD_NR);
+
+ if (is_present && pmd_young(pmdval) &&
+ likely(vma_has_recency(vma)))
+ folio_mark_accessed(folio);
+ }
+
+ /* Device private folios are pinned. */
+ if (is_device_private)
+ folio_put(folio);
+}
+
+static struct folio *normal_or_softleaf_folio_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmdval, bool is_present)
+{
+ if (is_present)
+ return vm_normal_folio_pmd(vma, addr, pmdval);
+
+ if (!thp_migration_supported())
+ WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+ return pmd_to_softleaf_folio(pmdval);
+}
+
+static bool has_deposited_pgtable(struct vm_area_struct *vma, pmd_t pmdval,
+ struct folio *folio)
+{
+ /* Some architectures require unconditional depositing. */
+ if (arch_needs_pgtable_deposit())
+ return true;
+
+ /*
+ * Huge zero always deposited except for DAX which handles itself, see
+ * set_huge_zero_folio().
+ */
+ if (is_huge_zero_pmd(pmdval))
+ return !vma_is_dax(vma);
+
+ /*
+ * Otherwise, only anonymous folios are deposited, see
+ * __do_huge_pmd_anonymous_page().
+ */
+ return folio && folio_test_anon(folio);
+}
+
+/**
+ * zap_huge_pmd - Zap a PMD-sized huge page mapping.
+ * @tlb: The MMU gather TLB state associated with the operation.
+ * @vma: The VMA containing the range to zap.
+ * @pmd: A pointer to the leaf PMD entry.
+ * @addr: The virtual address for the range to zap.
+ *
+ * Returns: %true if a huge PMD entry was zapped, %false if no huge PMD was present.
+ */
+bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
- pmd_t orig_pmd;
+ struct mm_struct *mm = tlb->mm;
+ struct folio *folio = NULL;
+ bool is_present = false;
+ bool has_deposit;
spinlock_t *ptl;
+ pmd_t orig_pmd;
tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
- return 0;
+ return false;
/*
* For architectures like ppc64 we look at deposited pgtable
* when calling pmdp_huge_get_and_clear. So do the
@@ -2362,64 +2493,19 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb->fullmm);
arch_check_zapped_pmd(vma, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
- if (arch_needs_pgtable_deposit())
- zap_deposited_table(tlb->mm, pmd);
- spin_unlock(ptl);
- } else if (is_huge_zero_pmd(orig_pmd)) {
- if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
- zap_deposited_table(tlb->mm, pmd);
- spin_unlock(ptl);
- } else {
- struct folio *folio = NULL;
- int flush_needed = 1;
- if (pmd_present(orig_pmd)) {
- struct page *page = pmd_page(orig_pmd);
+ is_present = pmd_present(orig_pmd);
+ folio = normal_or_softleaf_folio_pmd(vma, addr, orig_pmd, is_present);
+ has_deposit = has_deposited_pgtable(vma, orig_pmd, folio);
+ if (folio)
+ zap_huge_pmd_folio(mm, vma, orig_pmd, folio, is_present);
+ if (has_deposit)
+ zap_deposited_table(mm, pmd);
- folio = page_folio(page);
- folio_remove_rmap_pmd(folio, page, vma);
- WARN_ON_ONCE(folio_mapcount(folio) < 0);
- VM_BUG_ON_PAGE(!PageHead(page), page);
- } else if (pmd_is_valid_softleaf(orig_pmd)) {
- const softleaf_t entry = softleaf_from_pmd(orig_pmd);
-
- folio = softleaf_to_folio(entry);
- flush_needed = 0;
-
- if (!thp_migration_supported())
- WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
- }
-
- if (folio_test_anon(folio)) {
- zap_deposited_table(tlb->mm, pmd);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- } else {
- if (arch_needs_pgtable_deposit())
- zap_deposited_table(tlb->mm, pmd);
- add_mm_counter(tlb->mm, mm_counter_file(folio),
- -HPAGE_PMD_NR);
-
- /*
- * Use flush_needed to indicate whether the PMD entry
- * is present, instead of checking pmd_present() again.
- */
- if (flush_needed && pmd_young(orig_pmd) &&
- likely(vma_has_recency(vma)))
- folio_mark_accessed(folio);
- }
-
- if (folio_is_device_private(folio)) {
- folio_remove_rmap_pmd(folio, &folio->page, vma);
- WARN_ON_ONCE(folio_mapcount(folio) < 0);
- folio_put(folio);
- }
-
- spin_unlock(ptl);
- if (flush_needed)
- tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
- }
- return 1;
+ spin_unlock(ptl);
+ if (is_present && folio)
+ tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
+ return true;
}
#ifndef pmd_move_must_withdraw
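The deposit decisions that the old zap_huge_pmd() spread over three if/else arms are collapsed into has_deposited_pgtable(). A small userspace table, with hypothetical booleans standing in for the kernel predicates, enumerates the same rules:

#include <stdbool.h>
#include <stdio.h>

/* Mirrors has_deposited_pgtable(): an arch-mandated deposit wins, the huge
 * zero page deposits unless the VMA is DAX, otherwise only anon folios do. */
static bool deposited(bool arch_deposit, bool huge_zero, bool dax, bool anon_folio)
{
	if (arch_deposit)
		return true;
	if (huge_zero)
		return !dax;
	return anon_folio;
}

int main(void)
{
	printf("huge zero, !DAX:            %d\n", deposited(false, true,  false, false)); /* 1 */
	printf("huge zero, DAX:             %d\n", deposited(false, true,  true,  false)); /* 0 */
	printf("anon folio:                 %d\n", deposited(false, false, false, true));  /* 1 */
	printf("file folio / special PFN:   %d\n", deposited(false, false, false, false)); /* 0 */
	printf("any case, arch requires it: %d\n", deposited(true,  false, false, false)); /* 1 */
	return 0;
}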
@@ -2864,7 +2950,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
arch_check_zapped_pud(vma, orig_pud);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
- if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
+ if (vma_is_special_huge(vma)) {
spin_unlock(ptl);
/* No zero page support yet */
} else {
@@ -2972,7 +3058,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
pte_t entry;
- entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
+ entry = pfn_pte(zero_pfn(addr), vma->vm_page_prot);
entry = pte_mkspecial(entry);
if (pmd_uffd_wp(old_pmd))
entry = pte_mkuffd_wp(entry);
@@ -3015,7 +3101,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd);
- if (!vma_is_dax(vma) && vma_is_special_huge(vma))
+ if (vma_is_special_huge(vma))
return;
if (unlikely(pmd_is_migration_entry(old_pmd))) {
const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
@@ -4106,7 +4192,7 @@ out_unlock:
i_mmap_unlock_read(mapping);
out:
xas_destroy(&xas);
- if (old_order == HPAGE_PMD_ORDER)
+ if (is_pmd_order(old_order))
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
return ret;
@@ -4456,7 +4542,7 @@ retry:
goto next;
}
if (!folio_trylock(folio))
- goto next;
+ goto requeue;
if (!split_folio(folio)) {
did_split = true;
if (underused)
@@ -4465,13 +4551,18 @@ retry:
}
folio_unlock(folio);
next:
+		/*
+		 * If thp_underused() returned false, if split_folio()
+		 * succeeded, or if split_folio() failed on an underused
+		 * folio, consider the folio used and don't add it back
+		 * to the split queue.
+		 */
if (did_split || !folio_test_partially_mapped(folio))
continue;
+requeue:
/*
- * Only add back to the queue if folio is partially mapped.
- * If thp_underused returns false, or if split_folio fails
- * in the case it was underused, then consider it used and
- * don't add it back to split_queue.
+ * Add back partially mapped folios, or underused folios that
+ * we could not lock this round.
*/
fqueue = folio_split_queue_lock_irqsave(folio, &flags);
if (list_empty(&folio->_deferred_list)) {
@@ -4576,8 +4667,16 @@ next:
static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
{
- return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
- is_vm_hugetlb_page(vma);
+ if (vma_is_dax(vma))
+ return true;
+ if (vma_is_special_huge(vma))
+ return true;
+ if (vma_test(vma, VMA_IO_BIT))
+ return true;
+ if (is_vm_hugetlb_page(vma))
+ return true;
+
+ return false;
}
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,